30 #ifndef INCLUDED_volk_32f_acos_32f_a_H
31 #define INCLUDED_volk_32f_acos_32f_a_H
34 #include <smmintrin.h>
/*!
 * \brief Computes the arccosine of each float in aVector and stores the
 *        result in bVector, processing four elements per iteration with
 *        aligned SSE loads and stores.
 *
 * \param bVector    Output buffer; written through bPtr.
 * \param aVector    Input buffer; read through aPtr.
 * \param num_points Number of floats to process.
 *
 * The vector loop runs num_points / 4 times; the remaining
 * num_points % 4 elements are handled by a scalar acos() loop at the end.
 * _mm_load_ps / _mm_store_ps require 16-byte-aligned addresses.
 *
 * NOTE(review): the leading number on each code line is a source-listing
 * line number, not C, and the listing is incomplete.  Within the visible
 * text there is no assignment to d, z, x, y or arccosine before they are
 * first read, no declaration of i or j, no loop header for j, no pointer
 * advance inside the vector loop and no closing braces.  Presumably these
 * lines were lost in extraction -- TODO restore them from the upstream file
 * before compiling.
 */
41 static inline void volk_32f_acos_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points){
43 float* bPtr = bVector;
44 const float* aPtr = aVector;
46 unsigned int number = 0;
/* Number of complete 4-float groups handled by the vector loop. */
47 unsigned int quarterPoints = num_points / 4;
50 __m128 aVal, d, pi, pio2, x, y, z, arccosine;
51 __m128 fzeroes, fones, ftwos, ffours, condition;
/* Broadcast the constants used below into all four lanes. */
53 pi = _mm_set1_ps(3.14159265358979323846);
54 pio2 = _mm_set1_ps(3.14159265358979323846/2);
55 fzeroes = _mm_setzero_ps();
56 fones = _mm_set1_ps(1.0);
57 ftwos = _mm_set1_ps(2.0);
58 ffours = _mm_set1_ps(4.0);
60 for(;number < quarterPoints; number++){
/* Aligned load of four input samples. */
61 aVal = _mm_load_ps(aPtr);
/* aVal <- sqrt((1 + a) * (1 - a)) / a = sqrt(1 - a^2) / a, i.e. the tangent
 * of acos(a); the rest of the loop evaluates an arctangent of this value. */
63 aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal);
/* Lanes where z < 0 get 2*z subtracted, which turns z into |z|.
 * NOTE(review): z is presumably a copy of aVal taken just before -- confirm. */
65 condition = _mm_cmplt_ps(z, fzeroes);
66 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
/* Lanes where z < 1 get (1/z - z) added to x.
 * NOTE(review): x is presumably initialised from z before this -- confirm. */
68 condition = _mm_cmplt_ps(z, fones);
69 x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
/* Apply x <- x + sqrt(1 + x^2) twice, then take the reciprocal. */
71 for(i = 0; i < 2; i++)
72 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
73 x = _mm_div_ps(fones, x);
/* One Horner step y <- y * x^2 + (-1)^j / (2j + 1); these coefficients match
 * the arctangent power series.
 * NOTE(review): the loop over j that should enclose this line is not visible. */
76 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
/* Scale the polynomial value by 4 * x. */
78 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
/* Lanes where z > 1: y <- y + (pi/2 - 2y) = pi/2 - y. */
79 condition = _mm_cmpgt_ps(z, fones);
81 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
/* Lanes where aVal < 0: subtract 2*arccosine, i.e. negate arccosine.
 * NOTE(review): arccosine is presumably set from y before this -- confirm. */
83 condition = _mm_cmplt_ps(aVal, fzeroes);
84 arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
/* Lanes where d < 0: add pi.
 * NOTE(review): d is presumably the unmodified input sample -- confirm. */
85 condition = _mm_cmplt_ps(d, fzeroes);
86 arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
/* Aligned store of four results. */
88 _mm_store_ps(bPtr, arccosine);
/* Scalar tail: elements from quarterPoints * 4 up to num_points go through
 * the C library acos(). */
93 number = quarterPoints * 4;
94 for(;number < num_points; number++){
95 *bPtr++ = acos(*aPtr++);
104 #ifndef INCLUDED_volk_32f_acos_32f_u_H
105 #define INCLUDED_volk_32f_acos_32f_u_H
107 #ifdef LV_HAVE_SSE4_1
108 #include <smmintrin.h>
/*!
 * \brief Computes the arccosine of each float in aVector and stores the
 *        result in bVector, processing four elements per iteration with
 *        unaligned SSE loads and stores.
 *
 * \param bVector    Output buffer; written through bPtr.
 * \param aVector    Input buffer; read through aPtr.
 * \param num_points Number of floats to process.
 *
 * The vector loop runs num_points / 4 times; the remaining
 * num_points % 4 elements are handled by a scalar acos() loop at the end.
 * _mm_loadu_ps / _mm_storeu_ps place no alignment requirement on the
 * buffers.
 *
 * NOTE(review): the leading number on each code line is a source-listing
 * line number, not C, and the listing is incomplete.  Within the visible
 * text there is no assignment to d, z, x, y or arccosine before they are
 * first read, no declaration of i or j, no loop header for j, no pointer
 * advance inside the vector loop and no closing braces.  Presumably these
 * lines were lost in extraction -- TODO restore them from the upstream file
 * before compiling.
 */
115 static inline void volk_32f_acos_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points){
116 float* bPtr = bVector;
117 const float* aPtr = aVector;
119 unsigned int number = 0;
/* Number of complete 4-float groups handled by the vector loop. */
120 unsigned int quarterPoints = num_points / 4;
123 __m128 aVal, d, pi, pio2, x, y, z, arccosine;
124 __m128 fzeroes, fones, ftwos, ffours, condition;
/* Broadcast the constants used below into all four lanes. */
126 pi = _mm_set1_ps(3.14159265358979323846);
127 pio2 = _mm_set1_ps(3.14159265358979323846/2);
128 fzeroes = _mm_setzero_ps();
129 fones = _mm_set1_ps(1.0);
130 ftwos = _mm_set1_ps(2.0);
131 ffours = _mm_set1_ps(4.0);
133 for(;number < quarterPoints; number++){
/* Unaligned load of four input samples. */
134 aVal = _mm_loadu_ps(aPtr);
/* aVal <- sqrt((1 + a) * (1 - a)) / a = sqrt(1 - a^2) / a, i.e. the tangent
 * of acos(a); the rest of the loop evaluates an arctangent of this value. */
136 aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal);
/* Lanes where z < 0 get 2*z subtracted, which turns z into |z|.
 * NOTE(review): z is presumably a copy of aVal taken just before -- confirm. */
138 condition = _mm_cmplt_ps(z, fzeroes);
139 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
/* Lanes where z < 1 get (1/z - z) added to x.
 * NOTE(review): x is presumably initialised from z before this -- confirm. */
141 condition = _mm_cmplt_ps(z, fones);
142 x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
/* Apply x <- x + sqrt(1 + x^2) twice, then take the reciprocal. */
144 for(i = 0; i < 2; i++)
145 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
146 x = _mm_div_ps(fones, x);
/* One Horner step y <- y * x^2 + (-1)^j / (2j + 1); these coefficients match
 * the arctangent power series.
 * NOTE(review): the loop over j that should enclose this line is not visible. */
150 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
/* Scale the polynomial value by 4 * x. */
152 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
/* Lanes where z > 1: y <- y + (pi/2 - 2y) = pi/2 - y. */
153 condition = _mm_cmpgt_ps(z, fones);
155 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
/* Lanes where aVal < 0: subtract 2*arccosine, i.e. negate arccosine.
 * NOTE(review): arccosine is presumably set from y before this -- confirm. */
157 condition = _mm_cmplt_ps(aVal, fzeroes);
158 arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
/* Lanes where d < 0: add pi.
 * NOTE(review): d is presumably the unmodified input sample -- confirm. */
159 condition = _mm_cmplt_ps(d, fzeroes);
160 arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
/* Unaligned store of four results. */
162 _mm_storeu_ps(bPtr, arccosine);
/* Scalar tail: elements from quarterPoints * 4 up to num_points go through
 * the C library acos(). */
167 number = quarterPoints * 4;
168 for(;number < num_points; number++){
169 *bPtr++ = acos(*aPtr++);
175 #ifdef LV_HAVE_GENERIC
182 static inline void volk_32f_acos_32f_generic(
float* bVector,
const float* aVector,
unsigned int num_points){
183 float* bPtr = bVector;
184 const float* aPtr = aVector;
185 unsigned int number = 0;
187 for(number = 0; number < num_points; number++){
188 *bPtr++ = acos(*aPtr++);
#define ACOS_TERMS
/* Definition: volk_32f_acos_32f.h:28 */