23 #ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H
24 #define INCLUDED_volk_32f_s32f_stddev_32f_a_H
32 #include <smmintrin.h>
40 static inline void volk_32f_s32f_stddev_32f_a_sse4_1(
float* stddev,
const float* inputBuffer,
const float mean,
unsigned int num_points){
41 float returnValue = 0;
43 unsigned int number = 0;
44 const unsigned int sixteenthPoints = num_points / 16;
46 const float* aPtr = inputBuffer;
50 __m128 squareAccumulator = _mm_setzero_ps();
51 __m128 aVal1, aVal2, aVal3, aVal4;
52 __m128 cVal1, cVal2, cVal3, cVal4;
53 for(;number < sixteenthPoints; number++) {
54 aVal1 = _mm_load_ps(aPtr); aPtr += 4;
55 cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
57 aVal2 = _mm_load_ps(aPtr); aPtr += 4;
58 cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
60 aVal3 = _mm_load_ps(aPtr); aPtr += 4;
61 cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
63 aVal4 = _mm_load_ps(aPtr); aPtr += 4;
64 cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
66 cVal1 = _mm_or_ps(cVal1, cVal2);
67 cVal3 = _mm_or_ps(cVal3, cVal4);
68 cVal1 = _mm_or_ps(cVal1, cVal3);
70 squareAccumulator = _mm_add_ps(squareAccumulator, cVal1);
72 _mm_store_ps(squareBuffer,squareAccumulator);
73 returnValue = squareBuffer[0];
74 returnValue += squareBuffer[1];
75 returnValue += squareBuffer[2];
76 returnValue += squareBuffer[3];
78 number = sixteenthPoints * 16;
79 for(;number < num_points; number++){
80 returnValue += (*aPtr) * (*aPtr);
83 returnValue /= num_points;
84 returnValue -= (mean * mean);
85 returnValue = sqrtf(returnValue);
87 *stddev = returnValue;
92 #include <xmmintrin.h>
100 static inline void volk_32f_s32f_stddev_32f_a_sse(
float* stddev,
const float* inputBuffer,
const float mean,
unsigned int num_points){
101 float returnValue = 0;
103 unsigned int number = 0;
104 const unsigned int quarterPoints = num_points / 4;
106 const float* aPtr = inputBuffer;
110 __m128 squareAccumulator = _mm_setzero_ps();
111 __m128 aVal = _mm_setzero_ps();
112 for(;number < quarterPoints; number++) {
113 aVal = _mm_load_ps(aPtr);
114 aVal = _mm_mul_ps(aVal, aVal);
115 squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
118 _mm_store_ps(squareBuffer,squareAccumulator);
119 returnValue = squareBuffer[0];
120 returnValue += squareBuffer[1];
121 returnValue += squareBuffer[2];
122 returnValue += squareBuffer[3];
124 number = quarterPoints * 4;
125 for(;number < num_points; number++){
126 returnValue += (*aPtr) * (*aPtr);
129 returnValue /= num_points;
130 returnValue -= (mean * mean);
131 returnValue = sqrtf(returnValue);
133 *stddev = returnValue;
137 #ifdef LV_HAVE_GENERIC
145 static inline void volk_32f_s32f_stddev_32f_generic(
float* stddev,
const float* inputBuffer,
const float mean,
unsigned int num_points){
146 float returnValue = 0;
148 const float* aPtr = inputBuffer;
149 unsigned int number = 0;
151 for(number = 0; number < num_points; number++){
152 returnValue += (*aPtr) * (*aPtr);
156 returnValue /= num_points;
157 returnValue -= (mean * mean);
158 returnValue = sqrtf(returnValue);
160 *stddev = returnValue;
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27