27 #ifndef INCLUDED_volk_32f_tan_32f_a_H
28 #define INCLUDED_volk_32f_tan_32f_a_H
31 #include <smmintrin.h>
38 static inline void volk_32f_tan_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points){
40 float* bPtr = bVector;
41 const float* aPtr = aVector;
43 unsigned int number = 0;
44 unsigned int quarterPoints = num_points / 4;
47 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
48 __m128 sine, cosine, tangent, condition1, condition2, condition3;
49 __m128i q, r, ones, twos, fours;
51 m4pi = _mm_set1_ps(1.273239545);
52 pio4A = _mm_set1_ps(0.78515625);
53 pio4B = _mm_set1_ps(0.241876e-3);
54 ffours = _mm_set1_ps(4.0);
55 ftwos = _mm_set1_ps(2.0);
56 fones = _mm_set1_ps(1.0);
57 fzeroes = _mm_setzero_ps();
58 ones = _mm_set1_epi32(1);
59 twos = _mm_set1_epi32(2);
60 fours = _mm_set1_epi32(4);
62 cp1 = _mm_set1_ps(1.0);
63 cp2 = _mm_set1_ps(0.83333333e-1);
64 cp3 = _mm_set1_ps(0.2777778e-2);
65 cp4 = _mm_set1_ps(0.49603e-4);
66 cp5 = _mm_set1_ps(0.551e-6);
68 for(;number < quarterPoints; number++){
69 aVal = _mm_load_ps(aPtr);
70 s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
71 q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
72 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
74 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
75 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
77 s = _mm_div_ps(s, _mm_set1_ps(8.0));
80 s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
82 for(i = 0; i < 3; i++){
83 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
85 s = _mm_div_ps(s, ftwos);
87 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
88 cosine = _mm_sub_ps(fones, s);
90 condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
91 condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
92 condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
95 cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
96 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
97 sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
98 cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
99 tangent = _mm_div_ps(sine, cosine);
100 _mm_store_ps(bPtr, tangent);
105 number = quarterPoints * 4;
106 for(;number < num_points; number++){
107 *bPtr++ = tan(*aPtr++);
115 #ifndef INCLUDED_volk_32f_tan_32f_u_H
116 #define INCLUDED_volk_32f_tan_32f_u_H
118 #ifdef LV_HAVE_SSE4_1
119 #include <smmintrin.h>
126 static inline void volk_32f_tan_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points){
128 float* bPtr = bVector;
129 const float* aPtr = aVector;
131 unsigned int number = 0;
132 unsigned int quarterPoints = num_points / 4;
135 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
136 __m128 sine, cosine, tangent, condition1, condition2, condition3;
137 __m128i q, r, ones, twos, fours;
139 m4pi = _mm_set1_ps(1.273239545);
140 pio4A = _mm_set1_ps(0.78515625);
141 pio4B = _mm_set1_ps(0.241876e-3);
142 ffours = _mm_set1_ps(4.0);
143 ftwos = _mm_set1_ps(2.0);
144 fones = _mm_set1_ps(1.0);
145 fzeroes = _mm_setzero_ps();
146 ones = _mm_set1_epi32(1);
147 twos = _mm_set1_epi32(2);
148 fours = _mm_set1_epi32(4);
150 cp1 = _mm_set1_ps(1.0);
151 cp2 = _mm_set1_ps(0.83333333e-1);
152 cp3 = _mm_set1_ps(0.2777778e-2);
153 cp4 = _mm_set1_ps(0.49603e-4);
154 cp5 = _mm_set1_ps(0.551e-6);
156 for(;number < quarterPoints; number++){
157 aVal = _mm_loadu_ps(aPtr);
158 s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
159 q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
160 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
162 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
163 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
165 s = _mm_div_ps(s, _mm_set1_ps(8.0));
166 s = _mm_mul_ps(s, s);
168 s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
170 for(i = 0; i < 3; i++){
171 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
173 s = _mm_div_ps(s, ftwos);
175 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
176 cosine = _mm_sub_ps(fones, s);
178 condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
179 condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
180 condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
182 __m128 temp = cosine;
183 cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
184 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
185 sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
186 cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
187 tangent = _mm_div_ps(sine, cosine);
188 _mm_storeu_ps(bPtr, tangent);
193 number = quarterPoints * 4;
194 for(;number < num_points; number++){
195 *bPtr++ = tan(*aPtr++);
201 #ifdef LV_HAVE_GENERIC
208 static inline void volk_32f_tan_32f_generic(
float* bVector,
const float* aVector,
unsigned int num_points){
209 float* bPtr = bVector;
210 const float* aPtr = aVector;
211 unsigned int number = 0;
213 for(; number < num_points; number++){
214 *bPtr++ = tan(*aPtr++);