23 #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
24 #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
30 #ifdef LV_HAVE_GENERIC
33 static inline void volk_32fc_32f_dot_prod_32fc_generic(
lv_32fc_t* result,
const lv_32fc_t* input,
const float *
taps,
unsigned int num_points) {
36 float *realpt = &res[0], *imagpt = &res[1];
37 const float* aPtr = (
float*)input;
38 const float* bPtr=
taps;
39 unsigned int number = 0;
44 for(number = 0; number < num_points; number++){
45 *realpt += ((*aPtr++) * (*bPtr));
46 *imagpt += ((*aPtr++) * (*bPtr++));
57 #include <immintrin.h>
59 static inline void volk_32fc_32f_dot_prod_32fc_a_avx(
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points) {
61 unsigned int number = 0;
62 const unsigned int sixteenthPoints = num_points / 16;
65 float *realpt = &res[0], *imagpt = &res[1];
66 const float* aPtr = (
float*)input;
67 const float* bPtr =
taps;
69 __m256 a0Val, a1Val, a2Val, a3Val;
70 __m256 b0Val, b1Val, b2Val, b3Val;
71 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
72 __m256 c0Val, c1Val, c2Val, c3Val;
74 __m256 dotProdVal0 = _mm256_setzero_ps();
75 __m256 dotProdVal1 = _mm256_setzero_ps();
76 __m256 dotProdVal2 = _mm256_setzero_ps();
77 __m256 dotProdVal3 = _mm256_setzero_ps();
79 for(;number < sixteenthPoints; number++){
81 a0Val = _mm256_load_ps(aPtr);
82 a1Val = _mm256_load_ps(aPtr+8);
83 a2Val = _mm256_load_ps(aPtr+16);
84 a3Val = _mm256_load_ps(aPtr+24);
86 x0Val = _mm256_load_ps(bPtr);
87 x1Val = _mm256_load_ps(bPtr+8);
88 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
89 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
90 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
91 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
94 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
95 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
96 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
97 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
99 c0Val = _mm256_mul_ps(a0Val, b0Val);
100 c1Val = _mm256_mul_ps(a1Val, b1Val);
101 c2Val = _mm256_mul_ps(a2Val, b2Val);
102 c3Val = _mm256_mul_ps(a3Val, b3Val);
104 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
105 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
106 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
107 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
113 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
114 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
115 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
119 _mm256_store_ps(dotProductVector,dotProdVal0);
121 *realpt = dotProductVector[0];
122 *imagpt = dotProductVector[1];
123 *realpt += dotProductVector[2];
124 *imagpt += dotProductVector[3];
125 *realpt += dotProductVector[4];
126 *imagpt += dotProductVector[5];
127 *realpt += dotProductVector[6];
128 *imagpt += dotProductVector[7];
130 number = sixteenthPoints*16;
131 for(;number < num_points; number++){
132 *realpt += ((*aPtr++) * (*bPtr));
133 *imagpt += ((*aPtr++) * (*bPtr++));
147 static inline void volk_32fc_32f_dot_prod_32fc_a_sse(
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points) {
149 unsigned int number = 0;
150 const unsigned int sixteenthPoints = num_points / 8;
153 float *realpt = &res[0], *imagpt = &res[1];
154 const float* aPtr = (
float*)input;
155 const float* bPtr =
taps;
157 __m128 a0Val, a1Val, a2Val, a3Val;
158 __m128 b0Val, b1Val, b2Val, b3Val;
159 __m128 x0Val, x1Val, x2Val, x3Val;
160 __m128 c0Val, c1Val, c2Val, c3Val;
162 __m128 dotProdVal0 = _mm_setzero_ps();
163 __m128 dotProdVal1 = _mm_setzero_ps();
164 __m128 dotProdVal2 = _mm_setzero_ps();
165 __m128 dotProdVal3 = _mm_setzero_ps();
167 for(;number < sixteenthPoints; number++){
169 a0Val = _mm_load_ps(aPtr);
170 a1Val = _mm_load_ps(aPtr+4);
171 a2Val = _mm_load_ps(aPtr+8);
172 a3Val = _mm_load_ps(aPtr+12);
174 x0Val = _mm_load_ps(bPtr);
175 x1Val = _mm_load_ps(bPtr);
176 x2Val = _mm_load_ps(bPtr+4);
177 x3Val = _mm_load_ps(bPtr+4);
178 b0Val = _mm_unpacklo_ps(x0Val, x1Val);
179 b1Val = _mm_unpackhi_ps(x0Val, x1Val);
180 b2Val = _mm_unpacklo_ps(x2Val, x3Val);
181 b3Val = _mm_unpackhi_ps(x2Val, x3Val);
183 c0Val = _mm_mul_ps(a0Val, b0Val);
184 c1Val = _mm_mul_ps(a1Val, b1Val);
185 c2Val = _mm_mul_ps(a2Val, b2Val);
186 c3Val = _mm_mul_ps(a3Val, b3Val);
188 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
189 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
190 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
191 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
197 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
198 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
199 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
203 _mm_store_ps(dotProductVector,dotProdVal0);
205 *realpt = dotProductVector[0];
206 *imagpt = dotProductVector[1];
207 *realpt += dotProductVector[2];
208 *imagpt += dotProductVector[3];
210 number = sixteenthPoints*8;
211 for(;number < num_points; number++){
212 *realpt += ((*aPtr++) * (*bPtr));
213 *imagpt += ((*aPtr++) * (*bPtr++));
225 #include <immintrin.h>
227 static inline void volk_32fc_32f_dot_prod_32fc_u_avx(
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points) {
229 unsigned int number = 0;
230 const unsigned int sixteenthPoints = num_points / 16;
233 float *realpt = &res[0], *imagpt = &res[1];
234 const float* aPtr = (
float*)input;
235 const float* bPtr =
taps;
237 __m256 a0Val, a1Val, a2Val, a3Val;
238 __m256 b0Val, b1Val, b2Val, b3Val;
239 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
240 __m256 c0Val, c1Val, c2Val, c3Val;
242 __m256 dotProdVal0 = _mm256_setzero_ps();
243 __m256 dotProdVal1 = _mm256_setzero_ps();
244 __m256 dotProdVal2 = _mm256_setzero_ps();
245 __m256 dotProdVal3 = _mm256_setzero_ps();
247 for(;number < sixteenthPoints; number++){
249 a0Val = _mm256_loadu_ps(aPtr);
250 a1Val = _mm256_loadu_ps(aPtr+8);
251 a2Val = _mm256_loadu_ps(aPtr+16);
252 a3Val = _mm256_loadu_ps(aPtr+24);
254 x0Val = _mm256_loadu_ps(bPtr);
255 x1Val = _mm256_loadu_ps(bPtr+8);
256 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
257 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
258 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
259 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
262 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
263 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
264 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
265 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
267 c0Val = _mm256_mul_ps(a0Val, b0Val);
268 c1Val = _mm256_mul_ps(a1Val, b1Val);
269 c2Val = _mm256_mul_ps(a2Val, b2Val);
270 c3Val = _mm256_mul_ps(a3Val, b3Val);
272 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
273 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
274 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
275 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
281 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
282 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
283 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
287 _mm256_store_ps(dotProductVector,dotProdVal0);
289 *realpt = dotProductVector[0];
290 *imagpt = dotProductVector[1];
291 *realpt += dotProductVector[2];
292 *imagpt += dotProductVector[3];
293 *realpt += dotProductVector[4];
294 *imagpt += dotProductVector[5];
295 *realpt += dotProductVector[6];
296 *imagpt += dotProductVector[7];
298 number = sixteenthPoints*16;
299 for(;number < num_points; number++){
300 *realpt += ((*aPtr++) * (*bPtr));
301 *imagpt += ((*aPtr++) * (*bPtr++));
309 #include <arm_neon.h>
311 static inline void volk_32fc_32f_dot_prod_32fc_neon_unroll (
lv_32fc_t* __restrict result,
const lv_32fc_t* __restrict input,
const float* __restrict taps,
unsigned int num_points) {
314 const unsigned int quarterPoints = num_points / 8;
317 float *realpt = &res[0], *imagpt = &res[1];
318 const float* inputPtr = (
float*)input;
319 const float* tapsPtr =
taps;
320 float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
321 float accVector_real[4];
322 float accVector_imag[4];
324 float32x4x2_t inputVector0, inputVector1;
325 float32x4_t tapsVector0, tapsVector1;
326 float32x4_t tmp_real0, tmp_imag0;
327 float32x4_t tmp_real1, tmp_imag1;
328 float32x4_t real_accumulator0, imag_accumulator0;
329 float32x4_t real_accumulator1, imag_accumulator1;
333 real_accumulator0 = vld1q_f32( zero );
334 imag_accumulator0 = vld1q_f32( zero );
335 real_accumulator1 = vld1q_f32( zero );
336 imag_accumulator1 = vld1q_f32( zero );
338 for(number=0 ;number < quarterPoints; number++){
340 tapsVector0 = vld1q_f32(tapsPtr );
341 tapsVector1 = vld1q_f32(tapsPtr+4 );
344 inputVector0 = vld2q_f32(inputPtr );
345 inputVector1 = vld2q_f32(inputPtr+8 );
348 tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
349 tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);
351 tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
352 tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);
354 real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
355 imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);
357 real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
358 imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);
364 real_accumulator0 = vaddq_f32( real_accumulator0, real_accumulator1);
365 imag_accumulator0 = vaddq_f32( imag_accumulator0, imag_accumulator1);
368 vst1q_f32(accVector_real, real_accumulator0);
369 vst1q_f32(accVector_imag, imag_accumulator0);
370 *realpt = accVector_real[0] + accVector_real[1] +
371 accVector_real[2] + accVector_real[3] ;
373 *imagpt = accVector_imag[0] + accVector_imag[1] +
374 accVector_imag[2] + accVector_imag[3] ;
377 for(number=quarterPoints*8; number < num_points; number++){
378 *realpt += ((*inputPtr++) * (*tapsPtr));
379 *imagpt += ((*inputPtr++) * (*tapsPtr++));
388 #include <arm_neon.h>
390 static inline void volk_32fc_32f_dot_prod_32fc_a_neon (
lv_32fc_t* __restrict result,
const lv_32fc_t* __restrict input,
const float* __restrict taps,
unsigned int num_points) {
393 const unsigned int quarterPoints = num_points / 4;
396 float *realpt = &res[0], *imagpt = &res[1];
397 const float* inputPtr = (
float*)input;
398 const float* tapsPtr =
taps;
399 float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
400 float accVector_real[4];
401 float accVector_imag[4];
403 float32x4x2_t inputVector;
404 float32x4_t tapsVector;
405 float32x4_t tmp_real, tmp_imag;
406 float32x4_t real_accumulator, imag_accumulator;
411 real_accumulator = vld1q_f32( zero );
412 imag_accumulator = vld1q_f32( zero );
414 for(number=0 ;number < quarterPoints; number++){
417 tapsVector = vld1q_f32(tapsPtr );
420 inputVector = vld2q_f32(inputPtr );
422 tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
423 tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);
425 real_accumulator = vaddq_f32(real_accumulator, tmp_real);
426 imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);
435 vst1q_f32(accVector_real, real_accumulator);
436 vst1q_f32(accVector_imag, imag_accumulator);
437 *realpt = accVector_real[0] + accVector_real[1] +
438 accVector_real[2] + accVector_real[3] ;
440 *imagpt = accVector_imag[0] + accVector_imag[1] +
441 accVector_imag[2] + accVector_imag[3] ;
444 for(number=quarterPoints*4; number < num_points; number++){
445 *realpt += ((*inputPtr++) * (*tapsPtr));
446 *imagpt += ((*inputPtr++) * (*tapsPtr++));
455 extern void volk_32fc_32f_dot_prod_32fc_a_neonasm (
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points);
459 extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline (
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points);
464 static inline void volk_32fc_32f_dot_prod_32fc_u_sse(
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points) {
466 unsigned int number = 0;
467 const unsigned int sixteenthPoints = num_points / 8;
470 float *realpt = &res[0], *imagpt = &res[1];
471 const float* aPtr = (
float*)input;
472 const float* bPtr =
taps;
474 __m128 a0Val, a1Val, a2Val, a3Val;
475 __m128 b0Val, b1Val, b2Val, b3Val;
476 __m128 x0Val, x1Val, x2Val, x3Val;
477 __m128 c0Val, c1Val, c2Val, c3Val;
479 __m128 dotProdVal0 = _mm_setzero_ps();
480 __m128 dotProdVal1 = _mm_setzero_ps();
481 __m128 dotProdVal2 = _mm_setzero_ps();
482 __m128 dotProdVal3 = _mm_setzero_ps();
484 for(;number < sixteenthPoints; number++){
486 a0Val = _mm_loadu_ps(aPtr);
487 a1Val = _mm_loadu_ps(aPtr+4);
488 a2Val = _mm_loadu_ps(aPtr+8);
489 a3Val = _mm_loadu_ps(aPtr+12);
491 x0Val = _mm_loadu_ps(bPtr);
492 x1Val = _mm_loadu_ps(bPtr);
493 x2Val = _mm_loadu_ps(bPtr+4);
494 x3Val = _mm_loadu_ps(bPtr+4);
495 b0Val = _mm_unpacklo_ps(x0Val, x1Val);
496 b1Val = _mm_unpackhi_ps(x0Val, x1Val);
497 b2Val = _mm_unpacklo_ps(x2Val, x3Val);
498 b3Val = _mm_unpackhi_ps(x2Val, x3Val);
500 c0Val = _mm_mul_ps(a0Val, b0Val);
501 c1Val = _mm_mul_ps(a1Val, b1Val);
502 c2Val = _mm_mul_ps(a2Val, b2Val);
503 c3Val = _mm_mul_ps(a3Val, b3Val);
505 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
506 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
507 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
508 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
514 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
515 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
516 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
520 _mm_store_ps(dotProductVector,dotProdVal0);
522 *realpt = dotProductVector[0];
523 *imagpt = dotProductVector[1];
524 *realpt += dotProductVector[2];
525 *imagpt += dotProductVector[3];
527 number = sixteenthPoints*8;
528 for(;number < num_points; number++){
529 *realpt += ((*aPtr++) * (*bPtr));
530 *imagpt += ((*aPtr++) * (*bPtr++));
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9
float complex lv_32fc_t
Definition: volk_complex.h:56