23 #ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
24 #define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
30 #ifdef LV_HAVE_GENERIC
32 static inline void volk_16i_32fc_dot_prod_32fc_generic(
lv_32fc_t* result,
const short* input,
const lv_32fc_t *
taps,
unsigned int num_points) {
34 static const int N_UNROLL = 4;
42 unsigned n = (num_points / N_UNROLL) * N_UNROLL;
44 for(i = 0; i < n; i += N_UNROLL) {
45 acc0 += taps[i + 0] * (float)input[i + 0];
46 acc1 += taps[i + 1] * (float)input[i + 1];
47 acc2 += taps[i + 2] * (float)input[i + 2];
48 acc3 += taps[i + 3] * (float)input[i + 3];
51 for(; i < num_points; i++) {
52 acc0 += taps[i] * (float)input[i];
55 *result = acc0 + acc1 + acc2 + acc3;
62 static inline void volk_16i_32fc_dot_prod_32fc_neon(
lv_32fc_t* result,
const short* input,
const lv_32fc_t * taps,
unsigned int num_points) {
65 unsigned quarter_points = num_points / 4;
67 short* inputPtr = (
short*) input;
70 float32x4x2_t tapsVal, accumulator_val;
73 float32x4_t input_float, prod_re, prod_im;
75 accumulator_val.val[0] = vdupq_n_f32(0.0);
76 accumulator_val.val[1] = vdupq_n_f32(0.0);
78 for(ii = 0; ii < quarter_points; ++ii) {
79 tapsVal = vld2q_f32((
float*)tapsPtr);
80 input16 = vld1_s16(inputPtr);
82 input32 = vmovl_s16(input16);
84 input_float = vcvtq_f32_s32(input32);
86 prod_re = vmulq_f32(input_float, tapsVal.val[0]);
87 prod_im = vmulq_f32(input_float, tapsVal.val[1]);
89 accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
90 accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);
95 vst2q_f32((
float*)accumulator_vec, accumulator_val);
96 accumulator_vec[0] += accumulator_vec[1];
97 accumulator_vec[2] += accumulator_vec[3];
98 accumulator_vec[0] += accumulator_vec[2];
100 for(ii = quarter_points * 4; ii < num_points; ++ii) {
101 accumulator_vec[0] += *(tapsPtr++) * (
float)(*(inputPtr++));
104 *result = accumulator_vec[0];
109 #if LV_HAVE_SSE && LV_HAVE_MMX
111 static inline void volk_16i_32fc_dot_prod_32fc_u_sse(
lv_32fc_t* result,
const short* input,
const lv_32fc_t* taps,
unsigned int num_points) {
113 unsigned int number = 0;
114 const unsigned int sixteenthPoints = num_points / 8;
117 float *realpt = &res[0], *imagpt = &res[1];
118 const short* aPtr = input;
119 const float* bPtr = (
float*)taps;
122 __m128 f0, f1, f2, f3;
123 __m128 a0Val, a1Val, a2Val, a3Val;
124 __m128 b0Val, b1Val, b2Val, b3Val;
125 __m128 c0Val, c1Val, c2Val, c3Val;
127 __m128 dotProdVal0 = _mm_setzero_ps();
128 __m128 dotProdVal1 = _mm_setzero_ps();
129 __m128 dotProdVal2 = _mm_setzero_ps();
130 __m128 dotProdVal3 = _mm_setzero_ps();
132 for(;number < sixteenthPoints; number++){
134 m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
135 m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
136 f0 = _mm_cvtpi16_ps(m0);
137 f1 = _mm_cvtpi16_ps(m0);
138 f2 = _mm_cvtpi16_ps(m1);
139 f3 = _mm_cvtpi16_ps(m1);
141 a0Val = _mm_unpacklo_ps(f0, f1);
142 a1Val = _mm_unpackhi_ps(f0, f1);
143 a2Val = _mm_unpacklo_ps(f2, f3);
144 a3Val = _mm_unpackhi_ps(f2, f3);
146 b0Val = _mm_loadu_ps(bPtr);
147 b1Val = _mm_loadu_ps(bPtr+4);
148 b2Val = _mm_loadu_ps(bPtr+8);
149 b3Val = _mm_loadu_ps(bPtr+12);
151 c0Val = _mm_mul_ps(a0Val, b0Val);
152 c1Val = _mm_mul_ps(a1Val, b1Val);
153 c2Val = _mm_mul_ps(a2Val, b2Val);
154 c3Val = _mm_mul_ps(a3Val, b3Val);
156 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
157 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
158 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
159 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
165 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
166 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
167 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
171 _mm_store_ps(dotProductVector,dotProdVal0);
173 *realpt = dotProductVector[0];
174 *imagpt = dotProductVector[1];
175 *realpt += dotProductVector[2];
176 *imagpt += dotProductVector[3];
178 number = sixteenthPoints*8;
179 for(;number < num_points; number++){
180 *realpt += ((*aPtr) * (*bPtr++));
181 *imagpt += ((*aPtr++) * (*bPtr++));
192 #if LV_HAVE_SSE && LV_HAVE_MMX
195 static inline void volk_16i_32fc_dot_prod_32fc_a_sse(
lv_32fc_t* result,
const short* input,
const lv_32fc_t* taps,
unsigned int num_points) {
197 unsigned int number = 0;
198 const unsigned int sixteenthPoints = num_points / 8;
201 float *realpt = &res[0], *imagpt = &res[1];
202 const short* aPtr = input;
203 const float* bPtr = (
float*)taps;
206 __m128 f0, f1, f2, f3;
207 __m128 a0Val, a1Val, a2Val, a3Val;
208 __m128 b0Val, b1Val, b2Val, b3Val;
209 __m128 c0Val, c1Val, c2Val, c3Val;
211 __m128 dotProdVal0 = _mm_setzero_ps();
212 __m128 dotProdVal1 = _mm_setzero_ps();
213 __m128 dotProdVal2 = _mm_setzero_ps();
214 __m128 dotProdVal3 = _mm_setzero_ps();
216 for(;number < sixteenthPoints; number++){
218 m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
219 m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
220 f0 = _mm_cvtpi16_ps(m0);
221 f1 = _mm_cvtpi16_ps(m0);
222 f2 = _mm_cvtpi16_ps(m1);
223 f3 = _mm_cvtpi16_ps(m1);
225 a0Val = _mm_unpacklo_ps(f0, f1);
226 a1Val = _mm_unpackhi_ps(f0, f1);
227 a2Val = _mm_unpacklo_ps(f2, f3);
228 a3Val = _mm_unpackhi_ps(f2, f3);
230 b0Val = _mm_load_ps(bPtr);
231 b1Val = _mm_load_ps(bPtr+4);
232 b2Val = _mm_load_ps(bPtr+8);
233 b3Val = _mm_load_ps(bPtr+12);
235 c0Val = _mm_mul_ps(a0Val, b0Val);
236 c1Val = _mm_mul_ps(a1Val, b1Val);
237 c2Val = _mm_mul_ps(a2Val, b2Val);
238 c3Val = _mm_mul_ps(a3Val, b3Val);
240 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
241 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
242 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
243 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
249 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
250 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
251 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
255 _mm_store_ps(dotProductVector,dotProdVal0);
257 *realpt = dotProductVector[0];
258 *imagpt = dotProductVector[1];
259 *realpt += dotProductVector[2];
260 *imagpt += dotProductVector[3];
262 number = sixteenthPoints*8;
263 for(;number < num_points; number++){
264 *realpt += ((*aPtr) * (*bPtr++));
265 *imagpt += ((*aPtr++) * (*bPtr++));
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9
float complex lv_32fc_t
Definition: volk_complex.h:56