23 #ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H
24 #define INCLUDED_volk_32f_x2_dot_prod_16i_H
30 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Dot product of two float vectors, stored as a 16-bit integer.
 *
 * Portable scalar implementation: multiplies \p input and \p taps element by
 * element, accumulates the products in a single-precision float, and writes
 * the sum to \p result after a plain C cast to int16_t.
 *
 * \param result     Receives the converted dot product (one value).
 * \param input      First vector; at least \p num_points floats are read.
 * \param taps       Second vector; at least \p num_points floats are read.
 * \param num_points Number of element pairs to multiply and accumulate.
 *                   With 0 the result is 0.
 *
 * \note The float-to-int16_t cast discards the fractional part and does not
 *       clamp; a sum outside the int16_t range is undefined behaviour in C.
 */
static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result,
                                                    const float* input,
                                                    const float* taps,
                                                    unsigned int num_points) {
  float dotProduct = 0;
  const float* aPtr = input;
  const float* bPtr = taps;
  unsigned int number = 0;

  for (number = 0; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = (int16_t)dotProduct;
}
/*!
 * \brief Dot product of two float vectors, stored as a 16-bit integer
 *        (SSE, aligned loads).
 *
 * Processes 16 element pairs per iteration using four independent SSE
 * accumulators, reduces the accumulators to a scalar, then handles the
 * remaining (num_points % 16) pairs with scalar code.
 *
 * \param result     Receives the converted dot product (one value).
 * \param input      First vector; read with _mm_load_ps, so it must be
 *                   16-byte aligned when num_points >= 16.
 * \param taps       Second vector; same alignment requirement as \p input.
 * \param num_points Number of element pairs to multiply and accumulate.
 *                   With 0 the result is 0.
 *
 * \note The float-to-int16_t cast discards the fractional part and does not
 *       clamp; a sum outside the int16_t range is undefined behaviour in C.
 */
static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result,
                                                  const float* input,
                                                  const float* taps,
                                                  unsigned int num_points) {
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 a0Val, a1Val, a2Val, a3Val;
  __m128 b0Val, b1Val, b2Val, b3Val;
  __m128 c0Val, c1Val, c2Val, c3Val;

  /* Four separate accumulators keep the adds independent of each other. */
  __m128 dotProdVal0 = _mm_setzero_ps();
  __m128 dotProdVal1 = _mm_setzero_ps();
  __m128 dotProdVal2 = _mm_setzero_ps();
  __m128 dotProdVal3 = _mm_setzero_ps();

  for (; number < sixteenthPoints; number++) {
    a0Val = _mm_load_ps(aPtr);
    a1Val = _mm_load_ps(aPtr + 4);
    a2Val = _mm_load_ps(aPtr + 8);
    a3Val = _mm_load_ps(aPtr + 12);
    b0Val = _mm_load_ps(bPtr);
    b1Val = _mm_load_ps(bPtr + 4);
    b2Val = _mm_load_ps(bPtr + 8);
    b3Val = _mm_load_ps(bPtr + 12);

    c0Val = _mm_mul_ps(a0Val, b0Val);
    c1Val = _mm_mul_ps(a1Val, b1Val);
    c2Val = _mm_mul_ps(a2Val, b2Val);
    c3Val = _mm_mul_ps(a3Val, b3Val);

    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

    /* Step to the next block of 16 floats; the scalar tail below relies on
       the pointers ending up just past the last full block. */
    aPtr += 16;
    bPtr += 16;
  }

  /* Fold the four accumulators into one vector of four partial sums. */
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

  /* Unaligned store so the scratch buffer needs no alignment attribute. */
  float dotProductVector[4];
  _mm_storeu_ps(dotProductVector, dotProdVal0);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  /* Scalar tail: the remaining num_points % 16 element pairs. */
  number = sixteenthPoints * 16;
  for (; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = (int16_t)dotProduct;
}
/*!
 * \brief Dot product of two float vectors, stored as a 16-bit integer
 *        (SSE, unaligned loads).
 *
 * Processes 16 element pairs per iteration using four independent SSE
 * accumulators, reduces the accumulators to a scalar, then handles the
 * remaining (num_points % 16) pairs with scalar code. All vector loads use
 * _mm_loadu_ps, so neither input needs any particular alignment.
 *
 * \param result     Receives the converted dot product (one value).
 * \param input      First vector; at least \p num_points floats are read.
 * \param taps       Second vector; at least \p num_points floats are read.
 * \param num_points Number of element pairs to multiply and accumulate.
 *                   With 0 the result is 0.
 *
 * \note The float-to-int16_t cast discards the fractional part and does not
 *       clamp; a sum outside the int16_t range is undefined behaviour in C.
 */
static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result,
                                                  const float* input,
                                                  const float* taps,
                                                  unsigned int num_points) {
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 a0Val, a1Val, a2Val, a3Val;
  __m128 b0Val, b1Val, b2Val, b3Val;
  __m128 c0Val, c1Val, c2Val, c3Val;

  /* Four separate accumulators keep the adds independent of each other. */
  __m128 dotProdVal0 = _mm_setzero_ps();
  __m128 dotProdVal1 = _mm_setzero_ps();
  __m128 dotProdVal2 = _mm_setzero_ps();
  __m128 dotProdVal3 = _mm_setzero_ps();

  for (; number < sixteenthPoints; number++) {
    a0Val = _mm_loadu_ps(aPtr);
    a1Val = _mm_loadu_ps(aPtr + 4);
    a2Val = _mm_loadu_ps(aPtr + 8);
    a3Val = _mm_loadu_ps(aPtr + 12);
    b0Val = _mm_loadu_ps(bPtr);
    b1Val = _mm_loadu_ps(bPtr + 4);
    b2Val = _mm_loadu_ps(bPtr + 8);
    b3Val = _mm_loadu_ps(bPtr + 12);

    c0Val = _mm_mul_ps(a0Val, b0Val);
    c1Val = _mm_mul_ps(a1Val, b1Val);
    c2Val = _mm_mul_ps(a2Val, b2Val);
    c3Val = _mm_mul_ps(a3Val, b3Val);

    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

    /* Step to the next block of 16 floats; the scalar tail below relies on
       the pointers ending up just past the last full block. */
    aPtr += 16;
    bPtr += 16;
  }

  /* Fold the four accumulators into one vector of four partial sums. */
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

  /* Unaligned store so the scratch buffer needs no alignment attribute. */
  float dotProductVector[4];
  _mm_storeu_ps(dotProductVector, dotProdVal0);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  /* Scalar tail: the remaining num_points % 16 element pairs. */
  number = sixteenthPoints * 16;
  for (; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = (int16_t)dotProduct;
}
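/*
 * Cross-references carried over from the generated documentation listing:
 *   int16_t                 - signed short; definition: stdint.h:76
 *   __VOLK_ATTR_ALIGNED(x)  - macro; definition: volk_common.h:27
 *   taps                    - static const float taps[NSTEPS+1][NTAPS];
 *                             definition: interpolator_taps.h:9
 */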