GNU Radio Manual and C++ API Reference  3.7.6.1
The Free & Open Software Radio Ecosystem
volk_32f_acos_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 #include <stdio.h>
24 #include <math.h>
25 #include <inttypes.h>
26 
27 /* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/
28 #define ACOS_TERMS 2
29 
30 #ifndef INCLUDED_volk_32f_acos_32f_a_H
31 #define INCLUDED_volk_32f_acos_32f_a_H
32 
33 #ifdef LV_HAVE_SSE4_1
34 #include <smmintrin.h>
35 /*!
36  \brief Computes arccosine of input vector and stores results in output vector
37  \param bVector The vector where results will be stored
38  \param aVector The input vector of floats
39  \param num_points Number of points for which arccosine is to be computed
40 */
41 static inline void volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points){
42 
43  float* bPtr = bVector;
44  const float* aPtr = aVector;
45 
46  unsigned int number = 0;
47  unsigned int quarterPoints = num_points / 4;
48  int i, j;
49 
50  __m128 aVal, d, pi, pio2, x, y, z, arccosine;
51  __m128 fzeroes, fones, ftwos, ffours, condition;
52 
53  pi = _mm_set1_ps(3.14159265358979323846);
54  pio2 = _mm_set1_ps(3.14159265358979323846/2);
55  fzeroes = _mm_setzero_ps();
56  fones = _mm_set1_ps(1.0);
57  ftwos = _mm_set1_ps(2.0);
58  ffours = _mm_set1_ps(4.0);
59 
60  for(;number < quarterPoints; number++){
61  aVal = _mm_load_ps(aPtr);
62  d = aVal;
63  aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal);
64  z = aVal;
65  condition = _mm_cmplt_ps(z, fzeroes);
66  z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
67  x = z;
68  condition = _mm_cmplt_ps(z, fones);
69  x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
70 
71  for(i = 0; i < 2; i++)
72  x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
73  x = _mm_div_ps(fones, x);
74  y = fzeroes;
75  for(j = ACOS_TERMS - 1; j >=0 ; j--)
76  y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
77 
78  y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
79  condition = _mm_cmpgt_ps(z, fones);
80 
81  y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
82  arccosine = y;
83  condition = _mm_cmplt_ps(aVal, fzeroes);
84  arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
85  condition = _mm_cmplt_ps(d, fzeroes);
86  arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
87 
88  _mm_store_ps(bPtr, arccosine);
89  aPtr += 4;
90  bPtr += 4;
91  }
92 
93  number = quarterPoints * 4;
94  for(;number < num_points; number++){
95  *bPtr++ = acos(*aPtr++);
96  }
97 }
98 
99 #endif /* LV_HAVE_SSE4_1 for aligned */
100 
101 #endif /* INCLUDED_volk_32f_acos_32f_a_H */
102 
103 
104 #ifndef INCLUDED_volk_32f_acos_32f_u_H
105 #define INCLUDED_volk_32f_acos_32f_u_H
106 
107 #ifdef LV_HAVE_SSE4_1
108 #include <smmintrin.h>
109 /*!
110  \brief Computes arccosine of input vector and stores results in output vector
111  \param bVector The vector where results will be stored
112  \param aVector The input vector of floats
113  \param num_points Number of points for which arccosine is to be computed
114 */
115 static inline void volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points){
116  float* bPtr = bVector;
117  const float* aPtr = aVector;
118 
119  unsigned int number = 0;
120  unsigned int quarterPoints = num_points / 4;
121  int i, j;
122 
123  __m128 aVal, d, pi, pio2, x, y, z, arccosine;
124  __m128 fzeroes, fones, ftwos, ffours, condition;
125 
126  pi = _mm_set1_ps(3.14159265358979323846);
127  pio2 = _mm_set1_ps(3.14159265358979323846/2);
128  fzeroes = _mm_setzero_ps();
129  fones = _mm_set1_ps(1.0);
130  ftwos = _mm_set1_ps(2.0);
131  ffours = _mm_set1_ps(4.0);
132 
133  for(;number < quarterPoints; number++){
134  aVal = _mm_loadu_ps(aPtr);
135  d = aVal;
136  aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal);
137  z = aVal;
138  condition = _mm_cmplt_ps(z, fzeroes);
139  z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
140  x = z;
141  condition = _mm_cmplt_ps(z, fones);
142  x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
143 
144  for(i = 0; i < 2; i++)
145  x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
146  x = _mm_div_ps(fones, x);
147  y = fzeroes;
148 
149  for(j = ACOS_TERMS - 1; j >=0 ; j--)
150  y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
151 
152  y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
153  condition = _mm_cmpgt_ps(z, fones);
154 
155  y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
156  arccosine = y;
157  condition = _mm_cmplt_ps(aVal, fzeroes);
158  arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
159  condition = _mm_cmplt_ps(d, fzeroes);
160  arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
161 
162  _mm_storeu_ps(bPtr, arccosine);
163  aPtr += 4;
164  bPtr += 4;
165  }
166 
167  number = quarterPoints * 4;
168  for(;number < num_points; number++){
169  *bPtr++ = acos(*aPtr++);
170  }
171 }
172 
173 #endif /* LV_HAVE_SSE4_1 for aligned */
174 
175 #ifdef LV_HAVE_GENERIC
176 /*!
177  \brief Computes arccosine of input vector and stores results in output vector
178  \param bVector The vector where results will be stored
179  \param aVector The input vector of floats
180  \param num_points Number of points for which arccosine is to be computed
181 */
182 static inline void volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points){
183  float* bPtr = bVector;
184  const float* aPtr = aVector;
185  unsigned int number = 0;
186 
187  for(number = 0; number < num_points; number++){
188  *bPtr++ = acos(*aPtr++);
189  }
190 
191 }
192 #endif /* LV_HAVE_GENERIC */
193 
194 #endif /* INCLUDED_volk_32f_acos_32f_u_H */
#define ACOS_TERMS
Definition: volk_32f_acos_32f.h:28