23 #ifndef INCLUDED_volk_32f_tanh_32f_a_H
24 #define INCLUDED_volk_32f_tanh_32f_a_H
31 #ifdef LV_HAVE_GENERIC
38 static inline void volk_32f_tanh_32f_generic(
float* cVector,
const float* aVector,
39 unsigned int num_points)
41 unsigned int number = 0;
42 float* cPtr = cVector;
43 const float* aPtr = aVector;
44 for(; number < num_points; number++) {
45 *cPtr++ = tanh(*aPtr++);
52 #ifdef LV_HAVE_GENERIC
/*
 * Scalar tanh kernel that evaluates a ratio of two polynomials in the input
 * rather than calling the C library. Walks aVector through aPtr for
 * num_points iterations; cPtr is set up to walk cVector.
 *
 * NOTE(review): this listing omits several original lines (see the gaps in
 * the embedded line numbers). The branch that precedes the `else if`, the
 * statements each branch executes, the store through cPtr and the pointer
 * advances are not visible here, so the comments below cover only what is
 * shown.
 */
59 static inline void volk_32f_tanh_32f_series(
float* cVector,
const float* aVector,
60 unsigned int num_points)
62 unsigned int number = 0;
63 float* cPtr = cVector;
64 const float* aPtr = aVector;
/* One iteration per input element. */
65 for(; number < num_points; number++) {
/* Branch taken for inputs at or below -4.97 (the float is compared against a
 * double literal). Its body is not shown; presumably the output saturates
 * to -1 here -- TODO confirm. */
68 else if(*aPtr <= -4.97)
/* x2 is the square of the current input. */
71 float x2 = (*aPtr) * (*aPtr);
/* a = x * (135135 + 17325 x^2 + 378 x^4 + x^6), written in Horner form. */
72 float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
/* b = 135135 + 62370 x^2 + 3150 x^4 + 28 x^6, written in Horner form.
 * Presumably the result is a / b, computed on a line not shown -- TODO confirm. */
73 float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
85 #include <xmmintrin.h>
/*
 * SSE tanh kernel using aligned loads and stores. Processes num_points / 4
 * groups of four floats with a polynomial-ratio formula, then handles the
 * remaining elements in a scalar loop.
 *
 * _mm_load_ps / _mm_store_ps require 16-byte aligned addresses, so both
 * aVector and cVector must be aligned accordingly.
 *
 * NOTE(review): this listing omits several original lines (see the gaps in
 * the embedded line numbers). Pointer advances, loop closers and the bodies
 * of the scalar-tail branches are not visible here.
 */
92 static inline void volk_32f_tanh_32f_a_sse(
float* cVector,
const float* aVector,
93 unsigned int num_points)
95 unsigned int number = 0;
/* Number of complete 4-float groups. */
96 const unsigned int quarterPoints = num_points / 4;
98 float* cPtr = cVector;
99 const float* aPtr = aVector;
101 __m128 aVal, cVal, x2, a, b;
/* Polynomial coefficients, each broadcast to all four lanes. */
102 __m128 const1, const2, const3, const4, const5, const6;
103 const1 = _mm_set_ps1(135135.0f);
104 const2 = _mm_set_ps1(17325.0f);
105 const3 = _mm_set_ps1(378.0f);
106 const4 = _mm_set_ps1(62370.0f);
107 const5 = _mm_set_ps1(3150.0f);
108 const6 = _mm_set_ps1(28.0f);
/* Vector loop: one iteration per group of four inputs.
 * NOTE(review): no threshold test is visible in this loop, while the scalar
 * tail below branches at -4.97; confirm whether large-magnitude inputs are
 * meant to be treated differently in the two paths. */
109 for(;number < quarterPoints; number++){
111 aVal = _mm_load_ps(aPtr);
/* x2 = x^2 per lane. */
112 x2 = _mm_mul_ps(aVal, aVal);
/* a = x * (const1 + x2 * (const2 + x2 * (const3 + x2))) */
113 a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
/* b = const1 + x2 * (const4 + x2 * (const5 + x2 * const6)) */
114 b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
/* Result is the lane-wise quotient a / b. */
116 cVal = _mm_div_ps(a, b);
118 _mm_store_ps(cPtr, cVal);
/* Restart the counter at the first element not covered by a full group. */
124 number = quarterPoints * 4;
/* Scalar tail for the remaining num_points % 4 elements. */
125 for(;number < num_points; number++) {
/* Branch for inputs at or below -4.97; body not shown, presumably saturates
 * to -1 -- TODO confirm. */
128 else if(*aPtr <= -4.97)
/* Same numerator / denominator polynomials as the vector path. */
131 float x2 = (*aPtr) * (*aPtr);
132 float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
133 float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
143 #include <immintrin.h>
/*
 * AVX tanh kernel using aligned loads and stores. Processes num_points / 8
 * groups of eight floats with a polynomial-ratio formula, then handles the
 * remaining elements in a scalar loop.
 *
 * _mm256_load_ps / _mm256_store_ps require 32-byte aligned addresses, so
 * both aVector and cVector must be aligned accordingly.
 *
 * NOTE(review): this listing omits several original lines (see the gaps in
 * the embedded line numbers). Pointer advances, loop closers and the bodies
 * of the scalar-tail branches are not visible here.
 */
150 static inline void volk_32f_tanh_32f_a_avx(
float* cVector,
const float* aVector,
151 unsigned int num_points)
153 unsigned int number = 0;
/* Number of complete 8-float groups. */
154 const unsigned int eighthPoints = num_points / 8;
156 float* cPtr = cVector;
157 const float* aPtr = aVector;
159 __m256 aVal, cVal, x2, a, b;
/* Polynomial coefficients, each broadcast to all eight lanes. */
160 __m256 const1, const2, const3, const4, const5, const6;
161 const1 = _mm256_set1_ps(135135.0f);
162 const2 = _mm256_set1_ps(17325.0f);
163 const3 = _mm256_set1_ps(378.0f);
164 const4 = _mm256_set1_ps(62370.0f);
165 const5 = _mm256_set1_ps(3150.0f);
166 const6 = _mm256_set1_ps(28.0f);
/* Vector loop: one iteration per group of eight inputs.
 * NOTE(review): no threshold test is visible in this loop, while the scalar
 * tail below branches at -4.97; confirm whether large-magnitude inputs are
 * meant to be treated differently in the two paths. */
167 for(;number < eighthPoints; number++){
169 aVal = _mm256_load_ps(aPtr);
/* x2 = x^2 per lane. */
170 x2 = _mm256_mul_ps(aVal, aVal);
/* a = x * (const1 + x2 * (const2 + x2 * (const3 + x2))) */
171 a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
/* b = const1 + x2 * (const4 + x2 * (const5 + x2 * const6)) */
172 b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
/* Result is the lane-wise quotient a / b. */
174 cVal = _mm256_div_ps(a, b);
176 _mm256_store_ps(cPtr, cVal);
/* Restart the counter at the first element not covered by a full group. */
182 number = eighthPoints * 8;
/* Scalar tail for the remaining num_points % 8 elements. */
183 for(;number < num_points; number++) {
/* Branch for inputs at or below -4.97; body not shown, presumably saturates
 * to -1 -- TODO confirm. */
186 else if(*aPtr <= -4.97)
/* Same numerator / denominator polynomials as the vector path. */
189 float x2 = (*aPtr) * (*aPtr);
190 float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
191 float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
203 #include <xmmintrin.h>
/*
 * SSE tanh kernel using unaligned loads and stores (_mm_loadu_ps /
 * _mm_storeu_ps), so aVector and cVector need no particular alignment.
 * Processes num_points / 4 groups of four floats with a polynomial-ratio
 * formula, then handles the remaining elements in a scalar loop.
 *
 * NOTE(review): this listing omits several original lines (see the gaps in
 * the embedded line numbers). Pointer advances, loop closers and the bodies
 * of the scalar-tail branches are not visible here.
 */
210 static inline void volk_32f_tanh_32f_u_sse(
float* cVector,
const float* aVector,
211 unsigned int num_points)
213 unsigned int number = 0;
/* Number of complete 4-float groups. */
214 const unsigned int quarterPoints = num_points / 4;
216 float* cPtr = cVector;
217 const float* aPtr = aVector;
219 __m128 aVal, cVal, x2, a, b;
/* Polynomial coefficients, each broadcast to all four lanes. */
220 __m128 const1, const2, const3, const4, const5, const6;
221 const1 = _mm_set_ps1(135135.0f);
222 const2 = _mm_set_ps1(17325.0f);
223 const3 = _mm_set_ps1(378.0f);
224 const4 = _mm_set_ps1(62370.0f);
225 const5 = _mm_set_ps1(3150.0f);
226 const6 = _mm_set_ps1(28.0f);
/* Vector loop: one iteration per group of four inputs.
 * NOTE(review): no threshold test is visible in this loop, while the scalar
 * tail below branches at -4.97; confirm whether large-magnitude inputs are
 * meant to be treated differently in the two paths. */
227 for(;number < quarterPoints; number++){
229 aVal = _mm_loadu_ps(aPtr);
/* x2 = x^2 per lane. */
230 x2 = _mm_mul_ps(aVal, aVal);
/* a = x * (const1 + x2 * (const2 + x2 * (const3 + x2))) */
231 a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
/* b = const1 + x2 * (const4 + x2 * (const5 + x2 * const6)) */
232 b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
/* Result is the lane-wise quotient a / b. */
234 cVal = _mm_div_ps(a, b);
236 _mm_storeu_ps(cPtr, cVal);
/* Restart the counter at the first element not covered by a full group. */
242 number = quarterPoints * 4;
/* Scalar tail for the remaining num_points % 4 elements. */
243 for(;number < num_points; number++) {
/* Branch for inputs at or below -4.97; body not shown, presumably saturates
 * to -1 -- TODO confirm. */
246 else if(*aPtr <= -4.97)
/* Same numerator / denominator polynomials as the vector path. */
249 float x2 = (*aPtr) * (*aPtr);
250 float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
251 float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
262 #include <immintrin.h>
/*
 * AVX tanh kernel using unaligned loads and stores (_mm256_loadu_ps /
 * _mm256_storeu_ps), so aVector and cVector need no particular alignment.
 * Processes num_points / 8 groups of eight floats with a polynomial-ratio
 * formula, then handles the remaining elements in a scalar loop.
 *
 * NOTE(review): this listing omits several original lines (see the gaps in
 * the embedded line numbers) and ends before the function is closed. Pointer
 * advances, loop closers and the bodies of the scalar-tail branches are not
 * visible here.
 */
269 static inline void volk_32f_tanh_32f_u_avx(
float* cVector,
const float* aVector,
270 unsigned int num_points)
272 unsigned int number = 0;
/* Number of complete 8-float groups. */
273 const unsigned int eighthPoints = num_points / 8;
275 float* cPtr = cVector;
276 const float* aPtr = aVector;
278 __m256 aVal, cVal, x2, a, b;
/* Polynomial coefficients, each broadcast to all eight lanes. */
279 __m256 const1, const2, const3, const4, const5, const6;
280 const1 = _mm256_set1_ps(135135.0f);
281 const2 = _mm256_set1_ps(17325.0f);
282 const3 = _mm256_set1_ps(378.0f);
283 const4 = _mm256_set1_ps(62370.0f);
284 const5 = _mm256_set1_ps(3150.0f);
285 const6 = _mm256_set1_ps(28.0f);
/* Vector loop: one iteration per group of eight inputs.
 * NOTE(review): no threshold test is visible in this loop, while the scalar
 * tail below branches at -4.97; confirm whether large-magnitude inputs are
 * meant to be treated differently in the two paths. */
286 for(;number < eighthPoints; number++){
288 aVal = _mm256_loadu_ps(aPtr);
/* x2 = x^2 per lane. */
289 x2 = _mm256_mul_ps(aVal, aVal);
/* a = x * (const1 + x2 * (const2 + x2 * (const3 + x2))) */
290 a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
/* b = const1 + x2 * (const4 + x2 * (const5 + x2 * const6)) */
291 b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
/* Result is the lane-wise quotient a / b. */
293 cVal = _mm256_div_ps(a, b);
295 _mm256_storeu_ps(cPtr, cVal);
/* Restart the counter at the first element not covered by a full group. */
301 number = eighthPoints * 8;
/* Scalar tail for the remaining num_points % 8 elements. */
302 for(;number < num_points; number++) {
/* Branch for inputs at or below -4.97; body not shown, presumably saturates
 * to -1 -- TODO confirm. */
305 else if(*aPtr <= -4.97)
/* Same numerator / denominator polynomials as the vector path. */
308 float x2 = (*aPtr) * (*aPtr);
309 float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
310 float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));