23 #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
24 #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
32 #include <smmintrin.h>
40 static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(
float* stddev,
float* mean,
const float* inputBuffer,
unsigned int num_points){
41 float returnValue = 0;
44 unsigned int number = 0;
45 const unsigned int sixteenthPoints = num_points / 16;
47 const float* aPtr = inputBuffer;
51 __m128 accumulator = _mm_setzero_ps();
52 __m128 squareAccumulator = _mm_setzero_ps();
53 __m128 aVal1, aVal2, aVal3, aVal4;
54 __m128 cVal1, cVal2, cVal3, cVal4;
55 for(;number < sixteenthPoints; number++) {
56 aVal1 = _mm_load_ps(aPtr); aPtr += 4;
57 cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
58 accumulator = _mm_add_ps(accumulator, aVal1);
60 aVal2 = _mm_load_ps(aPtr); aPtr += 4;
61 cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
62 accumulator = _mm_add_ps(accumulator, aVal2);
64 aVal3 = _mm_load_ps(aPtr); aPtr += 4;
65 cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
66 accumulator = _mm_add_ps(accumulator, aVal3);
68 aVal4 = _mm_load_ps(aPtr); aPtr += 4;
69 cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
70 accumulator = _mm_add_ps(accumulator, aVal4);
72 cVal1 = _mm_or_ps(cVal1, cVal2);
73 cVal3 = _mm_or_ps(cVal3, cVal4);
74 cVal1 = _mm_or_ps(cVal1, cVal3);
76 squareAccumulator = _mm_add_ps(squareAccumulator, cVal1);
78 _mm_store_ps(meanBuffer,accumulator);
79 _mm_store_ps(squareBuffer,squareAccumulator);
80 newMean = meanBuffer[0];
81 newMean += meanBuffer[1];
82 newMean += meanBuffer[2];
83 newMean += meanBuffer[3];
84 returnValue = squareBuffer[0];
85 returnValue += squareBuffer[1];
86 returnValue += squareBuffer[2];
87 returnValue += squareBuffer[3];
89 number = sixteenthPoints * 16;
90 for(;number < num_points; number++){
91 returnValue += (*aPtr) * (*aPtr);
94 newMean /= num_points;
95 returnValue /= num_points;
96 returnValue -= (newMean * newMean);
97 returnValue = sqrtf(returnValue);
99 *stddev = returnValue;
105 #include <xmmintrin.h>
113 static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(
float* stddev,
float* mean,
const float* inputBuffer,
unsigned int num_points){
114 float returnValue = 0;
117 unsigned int number = 0;
118 const unsigned int quarterPoints = num_points / 4;
120 const float* aPtr = inputBuffer;
124 __m128 accumulator = _mm_setzero_ps();
125 __m128 squareAccumulator = _mm_setzero_ps();
126 __m128 aVal = _mm_setzero_ps();
127 for(;number < quarterPoints; number++) {
128 aVal = _mm_load_ps(aPtr);
129 accumulator = _mm_add_ps(accumulator, aVal);
130 aVal = _mm_mul_ps(aVal, aVal);
131 squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
134 _mm_store_ps(meanBuffer,accumulator);
135 _mm_store_ps(squareBuffer,squareAccumulator);
136 newMean = meanBuffer[0];
137 newMean += meanBuffer[1];
138 newMean += meanBuffer[2];
139 newMean += meanBuffer[3];
140 returnValue = squareBuffer[0];
141 returnValue += squareBuffer[1];
142 returnValue += squareBuffer[2];
143 returnValue += squareBuffer[3];
145 number = quarterPoints * 4;
146 for(;number < num_points; number++){
147 returnValue += (*aPtr) * (*aPtr);
150 newMean /= num_points;
151 returnValue /= num_points;
152 returnValue -= (newMean * newMean);
153 returnValue = sqrtf(returnValue);
155 *stddev = returnValue;
160 #ifdef LV_HAVE_GENERIC
168 static inline void volk_32f_stddev_and_mean_32f_x2_generic(
float* stddev,
float* mean,
const float* inputBuffer,
unsigned int num_points){
169 float returnValue = 0;
172 const float* aPtr = inputBuffer;
173 unsigned int number = 0;
175 for(number = 0; number < num_points; number++){
176 returnValue += (*aPtr) * (*aPtr);
179 newMean /= num_points;
180 returnValue /= num_points;
181 returnValue -= (newMean * newMean);
182 returnValue = sqrtf(returnValue);
184 *stddev = returnValue;
/* Reference: __VOLK_ATTR_ALIGNED(x) is defined in volk_common.h (line 27). */