30 #ifndef INCLUDED_volk_32f_asin_32f_a_H
31 #define INCLUDED_volk_32f_asin_32f_a_H
34 #include <smmintrin.h>
/*
 * Aligned SSE kernel: writes the arcsine of each of the num_points floats
 * in aVector to bVector.
 *
 * bVector    - output buffer; written four floats at a time with
 *              _mm_store_ps, so it must be 16-byte aligned.
 * aVector    - input buffer; read four floats at a time with _mm_load_ps,
 *              so it must be 16-byte aligned.
 * num_points - number of floats to process.
 *
 * Full groups of four are handled by the vector loop; the remaining
 * num_points % 4 elements are handled by the scalar asin() loop at the end.
 *
 * NOTE(review): this listing appears to be missing lines. z, x, y and
 * arcsine are read without any visible assignment, i and j are never
 * declared, the loop that owns the series step has no visible header, and
 * no closing braces or pointer increments are visible. Confirm against the
 * complete file before relying on the notes below.
 */
41 static inline void volk_32f_asin_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points){
43 float* bPtr = bVector;
44 const float* aPtr = aVector;
46 unsigned int number = 0;
/* Number of complete 4-float groups. */
47 unsigned int quarterPoints = num_points / 4;
50 __m128 aVal, pio2, x, y, z, arcsine;
51 __m128 fzeroes, fones, ftwos, ffours, condition;
/* Broadcast constants: pi/2, 0, 1, 2 and 4 in every lane. */
53 pio2 = _mm_set1_ps(3.14159265358979323846/2);
54 fzeroes = _mm_setzero_ps();
55 fones = _mm_set1_ps(1.0);
56 ftwos = _mm_set1_ps(2.0);
57 ffours = _mm_set1_ps(4.0);
59 for(;number < quarterPoints; number++){
60 aVal = _mm_load_ps(aPtr);
/* aVal <- a / sqrt((1 + a) * (1 - a)) = a / sqrt(1 - a^2), i.e. the tangent
 * of asin(a); the rest of the loop body evaluates an arctangent of it. */
61 aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
/* Lanes with z < 0 get 2*z subtracted, which flips their sign (z <- |z|).
 * NOTE(review): presumably z was set from aVal on a line not shown. */
63 condition = _mm_cmplt_ps(z, fzeroes);
64 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
/* Lanes with z < 1 get (1/z - z) added to x.
 * NOTE(review): if x starts as a copy of z this yields 1/z in those lanes,
 * keeping x >= 1 -- the initial value of x is not visible here. */
66 condition = _mm_cmplt_ps(z, fones);
67 x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
/* Apply x <- x + sqrt(1 + x^2) twice. This has the form of the cotangent
 * half-angle identity, so each pass presumably halves the angle. */
69 for(i = 0; i < 2; i++){
70 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
/* Take the reciprocal of x. */
72 x = _mm_div_ps(fones, x);
/* Horner step y <- y * x^2 + (-1)^j / (2j + 1); the coefficients have the
 * form of the arctangent power-series terms.
 * NOTE(review): the loop over j and the initial value of y are not visible. */
75 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
/* Multiply the polynomial by x and by 4 (the factor 4 presumably undoes the
 * two halvings above). */
78 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
/* Lanes with z > 1: y <- y + (pi/2 - 2y) = pi/2 - y. */
79 condition = _mm_cmpgt_ps(z, fones);
81 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
/* Negate the result in lanes where the transformed input aVal is negative.
 * NOTE(review): presumably arcsine was set from y on a line not shown. */
83 condition = _mm_cmplt_ps(aVal, fzeroes);
84 arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
86 _mm_store_ps(bPtr, arcsine);
/* Scalar tail: elements past the last full group of four go through asin().
 * NOTE(review): this relies on aPtr/bPtr having been advanced past the
 * vectorised elements; no such advance is visible in this listing. */
91 number = quarterPoints * 4;
92 for(;number < num_points; number++){
93 *bPtr++ = asin(*aPtr++);
101 #ifndef INCLUDED_volk_32f_asin_32f_u_H
102 #define INCLUDED_volk_32f_asin_32f_u_H
104 #ifdef LV_HAVE_SSE4_1
105 #include <smmintrin.h>
/*
 * Unaligned SSE kernel: writes the arcsine of each of the num_points floats
 * in aVector to bVector.
 *
 * bVector    - output buffer; written four floats at a time with
 *              _mm_storeu_ps, which has no alignment requirement.
 * aVector    - input buffer; read four floats at a time with _mm_loadu_ps,
 *              which has no alignment requirement.
 * num_points - number of floats to process.
 *
 * Full groups of four are handled by the vector loop; the remaining
 * num_points % 4 elements are handled by the scalar asin() loop at the end.
 *
 * NOTE(review): this listing appears to be missing lines. z, x, y and
 * arcsine are read without any visible assignment, i and j are never
 * declared, the loop that owns the series step has no visible header, and
 * no closing braces or pointer increments are visible. Confirm against the
 * complete file before relying on the notes below.
 */
112 static inline void volk_32f_asin_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points){
114 float* bPtr = bVector;
115 const float* aPtr = aVector;
117 unsigned int number = 0;
/* Number of complete 4-float groups. */
118 unsigned int quarterPoints = num_points / 4;
121 __m128 aVal, pio2, x, y, z, arcsine;
122 __m128 fzeroes, fones, ftwos, ffours, condition;
/* Broadcast constants: pi/2, 0, 1, 2 and 4 in every lane. */
124 pio2 = _mm_set1_ps(3.14159265358979323846/2);
125 fzeroes = _mm_setzero_ps();
126 fones = _mm_set1_ps(1.0);
127 ftwos = _mm_set1_ps(2.0);
128 ffours = _mm_set1_ps(4.0);
130 for(;number < quarterPoints; number++){
131 aVal = _mm_loadu_ps(aPtr);
/* aVal <- a / sqrt((1 + a) * (1 - a)) = a / sqrt(1 - a^2), i.e. the tangent
 * of asin(a); the rest of the loop body evaluates an arctangent of it. */
132 aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
/* Lanes with z < 0 get 2*z subtracted, which flips their sign (z <- |z|).
 * NOTE(review): presumably z was set from aVal on a line not shown. */
134 condition = _mm_cmplt_ps(z, fzeroes);
135 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
/* Lanes with z < 1 get (1/z - z) added to x.
 * NOTE(review): if x starts as a copy of z this yields 1/z in those lanes,
 * keeping x >= 1 -- the initial value of x is not visible here. */
137 condition = _mm_cmplt_ps(z, fones);
138 x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
/* Apply x <- x + sqrt(1 + x^2) twice. This has the form of the cotangent
 * half-angle identity, so each pass presumably halves the angle. */
140 for(i = 0; i < 2; i++){
141 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
/* Take the reciprocal of x. */
143 x = _mm_div_ps(fones, x);
/* Horner step y <- y * x^2 + (-1)^j / (2j + 1); the coefficients have the
 * form of the arctangent power-series terms.
 * NOTE(review): the loop over j and the initial value of y are not visible. */
146 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
/* Multiply the polynomial by x and by 4 (the factor 4 presumably undoes the
 * two halvings above). */
149 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
/* Lanes with z > 1: y <- y + (pi/2 - 2y) = pi/2 - y. */
150 condition = _mm_cmpgt_ps(z, fones);
152 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
/* Negate the result in lanes where the transformed input aVal is negative.
 * NOTE(review): presumably arcsine was set from y on a line not shown. */
154 condition = _mm_cmplt_ps(aVal, fzeroes);
155 arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
157 _mm_storeu_ps(bPtr, arcsine);
/* Scalar tail: elements past the last full group of four go through asin().
 * NOTE(review): this relies on aPtr/bPtr having been advanced past the
 * vectorised elements; no such advance is visible in this listing. */
162 number = quarterPoints * 4;
163 for(;number < num_points; number++){
164 *bPtr++ = asin(*aPtr++);
170 #ifdef LV_HAVE_GENERIC
177 static inline void volk_32f_asin_32f_u_generic(
float* bVector,
const float* aVector,
unsigned int num_points){
178 float* bPtr = bVector;
179 const float* aPtr = aVector;
180 unsigned int number = 0;
182 for(number = 0; number < num_points; number++){
183 *bPtr++ = asin(*aPtr++);
#define ASIN_TERMS
Definition: volk_32f_asin_32f.h:28