GNU Radio Manual and C++ API Reference  3.7.6.1
The Free & Open Software Radio Ecosystem
volk_32f_tan_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 #include <stdio.h>
24 #include <math.h>
25 #include <inttypes.h>
26 
27 #ifndef INCLUDED_volk_32f_tan_32f_a_H
28 #define INCLUDED_volk_32f_tan_32f_a_H
29 
30 #ifdef LV_HAVE_SSE4_1
31 #include <smmintrin.h>
32 /*!
33  \brief Computes tangent of input vector and stores results in output vector
34  \param bVector The vector where results will be stored
35  \param aVector The input vector of floats
36  \param num_points Number of points for which tangent is to be computed
37 */
38 static inline void volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points){
39 
40  float* bPtr = bVector;
41  const float* aPtr = aVector;
42 
43  unsigned int number = 0;
44  unsigned int quarterPoints = num_points / 4;
45  unsigned int i = 0;
46 
47  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
48  __m128 sine, cosine, tangent, condition1, condition2, condition3;
49  __m128i q, r, ones, twos, fours;
50 
51  m4pi = _mm_set1_ps(1.273239545);
52  pio4A = _mm_set1_ps(0.78515625);
53  pio4B = _mm_set1_ps(0.241876e-3);
54  ffours = _mm_set1_ps(4.0);
55  ftwos = _mm_set1_ps(2.0);
56  fones = _mm_set1_ps(1.0);
57  fzeroes = _mm_setzero_ps();
58  ones = _mm_set1_epi32(1);
59  twos = _mm_set1_epi32(2);
60  fours = _mm_set1_epi32(4);
61 
62  cp1 = _mm_set1_ps(1.0);
63  cp2 = _mm_set1_ps(0.83333333e-1);
64  cp3 = _mm_set1_ps(0.2777778e-2);
65  cp4 = _mm_set1_ps(0.49603e-4);
66  cp5 = _mm_set1_ps(0.551e-6);
67 
68  for(;number < quarterPoints; number++){
69  aVal = _mm_load_ps(aPtr);
70  s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
71  q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
72  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
73 
74  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
75  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
76 
77  s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
78  s = _mm_mul_ps(s, s);
79  // Evaluate Taylor series
80  s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
81 
82  for(i = 0; i < 3; i++){
83  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
84  }
85  s = _mm_div_ps(s, ftwos);
86 
87  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
88  cosine = _mm_sub_ps(fones, s);
89 
90  condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
91  condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
92  condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
93 
94  __m128 temp = cosine;
95  cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
96  sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
97  sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
98  cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
99  tangent = _mm_div_ps(sine, cosine);
100  _mm_store_ps(bPtr, tangent);
101  aPtr += 4;
102  bPtr += 4;
103  }
104 
105  number = quarterPoints * 4;
106  for(;number < num_points; number++){
107  *bPtr++ = tan(*aPtr++);
108  }
109 }
110 
111 #endif /* LV_HAVE_SSE4_1 for aligned */
112 
113 #endif /* INCLUDED_volk_32f_tan_32f_a_H */
114 
115 #ifndef INCLUDED_volk_32f_tan_32f_u_H
116 #define INCLUDED_volk_32f_tan_32f_u_H
117 
118 #ifdef LV_HAVE_SSE4_1
119 #include <smmintrin.h>
120 /*!
121  \brief Computes tangent of input vector and stores results in output vector
122  \param bVector The vector where results will be stored
123  \param aVector The input vector of floats
124  \param num_points Number of points for which tangent is to be computed
125 */
126 static inline void volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points){
127 
128  float* bPtr = bVector;
129  const float* aPtr = aVector;
130 
131  unsigned int number = 0;
132  unsigned int quarterPoints = num_points / 4;
133  unsigned int i = 0;
134 
135  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
136  __m128 sine, cosine, tangent, condition1, condition2, condition3;
137  __m128i q, r, ones, twos, fours;
138 
139  m4pi = _mm_set1_ps(1.273239545);
140  pio4A = _mm_set1_ps(0.78515625);
141  pio4B = _mm_set1_ps(0.241876e-3);
142  ffours = _mm_set1_ps(4.0);
143  ftwos = _mm_set1_ps(2.0);
144  fones = _mm_set1_ps(1.0);
145  fzeroes = _mm_setzero_ps();
146  ones = _mm_set1_epi32(1);
147  twos = _mm_set1_epi32(2);
148  fours = _mm_set1_epi32(4);
149 
150  cp1 = _mm_set1_ps(1.0);
151  cp2 = _mm_set1_ps(0.83333333e-1);
152  cp3 = _mm_set1_ps(0.2777778e-2);
153  cp4 = _mm_set1_ps(0.49603e-4);
154  cp5 = _mm_set1_ps(0.551e-6);
155 
156  for(;number < quarterPoints; number++){
157  aVal = _mm_loadu_ps(aPtr);
158  s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
159  q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
160  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
161 
162  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
163  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
164 
165  s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
166  s = _mm_mul_ps(s, s);
167  // Evaluate Taylor series
168  s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
169 
170  for(i = 0; i < 3; i++){
171  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
172  }
173  s = _mm_div_ps(s, ftwos);
174 
175  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
176  cosine = _mm_sub_ps(fones, s);
177 
178  condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
179  condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
180  condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
181 
182  __m128 temp = cosine;
183  cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
184  sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
185  sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
186  cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
187  tangent = _mm_div_ps(sine, cosine);
188  _mm_storeu_ps(bPtr, tangent);
189  aPtr += 4;
190  bPtr += 4;
191  }
192 
193  number = quarterPoints * 4;
194  for(;number < num_points; number++){
195  *bPtr++ = tan(*aPtr++);
196  }
197 }
198 
199 #endif /* LV_HAVE_SSE4_1 for unaligned */
200 
201 #ifdef LV_HAVE_GENERIC
202 /*!
203  \brief Computes tangent of input vector and stores results in output vector
204  \param bVector The vector where results will be stored
205  \param aVector The input vector of floats
206  \param num_points Number of points for which tangent is to be computed
207 */
208 static inline void volk_32f_tan_32f_generic(float* bVector, const float* aVector, unsigned int num_points){
209  float* bPtr = bVector;
210  const float* aPtr = aVector;
211  unsigned int number = 0;
212 
213  for(; number < num_points; number++){
214  *bPtr++ = tan(*aPtr++);
215  }
216 
217 }
218 #endif /* LV_HAVE_GENERIC */
219 
220 #endif /* INCLUDED_volk_32f_tan_32f_u_H */