23 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
24 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
30 #ifdef LV_HAVE_GENERIC
/*
 * Dot product of two float vectors, portable scalar kernel.
 *
 * result     - receives sum(input[i] * taps[i]) for i in [0, num_points)
 * input      - first operand; num_points floats are read
 * taps       - second operand; num_points floats are read
 * num_points - number of element pairs; 0 stores 0 into *result
 */
static inline void volk_32f_x2_dot_prod_32f_generic(float* result,
                                                    const float* input,
                                                    const float* taps,
                                                    unsigned int num_points)
{
  float dotProduct = 0;
  const float* aPtr = input;
  const float* bPtr = taps;
  unsigned int number = 0;

  for (number = 0; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
/*
 * Dot product of two float vectors using SSE with unaligned loads.
 *
 * result     - receives sum(input[i] * taps[i]) for i in [0, num_points)
 * input      - first operand; read with unaligned loads
 * taps       - second operand; read with unaligned loads
 * num_points - number of element pairs; 0 stores 0 into *result
 *
 * The vector loop consumes 16 points per iteration into four independent
 * accumulators; the remaining num_points % 16 points go through a scalar loop.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse(float* result,
                                                  const float* input,
                                                  const float* taps,
                                                  unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  float dotProductVector[4];
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 a0Val, a1Val, a2Val, a3Val;
  __m128 b0Val, b1Val, b2Val, b3Val;
  __m128 c0Val, c1Val, c2Val, c3Val;

  __m128 dotProdVal0 = _mm_setzero_ps();
  __m128 dotProdVal1 = _mm_setzero_ps();
  __m128 dotProdVal2 = _mm_setzero_ps();
  __m128 dotProdVal3 = _mm_setzero_ps();

  for (; number < sixteenthPoints; number++) {
    a0Val = _mm_loadu_ps(aPtr);
    a1Val = _mm_loadu_ps(aPtr + 4);
    a2Val = _mm_loadu_ps(aPtr + 8);
    a3Val = _mm_loadu_ps(aPtr + 12);
    b0Val = _mm_loadu_ps(bPtr);
    b1Val = _mm_loadu_ps(bPtr + 4);
    b2Val = _mm_loadu_ps(bPtr + 8);
    b3Val = _mm_loadu_ps(bPtr + 12);

    c0Val = _mm_mul_ps(a0Val, b0Val);
    c1Val = _mm_mul_ps(a1Val, b1Val);
    c2Val = _mm_mul_ps(a2Val, b2Val);
    c3Val = _mm_mul_ps(a3Val, b3Val);

    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

    /* Advance past the 16 points just consumed so the scalar tail loop
     * below starts at the first unprocessed element. */
    aPtr += 16;
    bPtr += 16;
  }

  /* Fold the four accumulators into one vector of four partial sums. */
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

  /* Unaligned store: the scratch array carries no alignment attribute. */
  _mm_storeu_ps(dotProductVector, dotProdVal0);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  number = sixteenthPoints * 16;
  for (; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
122 #include <pmmintrin.h>
/*
 * Dot product of two float vectors, SSE3-tagged kernel with unaligned loads.
 *
 * result     - receives sum(input[i] * taps[i]) for i in [0, num_points)
 * input      - first operand; read with unaligned loads
 * taps       - second operand; read with unaligned loads
 * num_points - number of element pairs; 0 stores 0 into *result
 *
 * Every intrinsic used here (_mm_loadu_ps, _mm_mul_ps, _mm_add_ps,
 * _mm_storeu_ps) is a baseline SSE operation. The vector loop consumes
 * 16 points per iteration into four independent accumulators; the
 * remaining num_points % 16 points go through a scalar loop.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse3(float* result,
                                                   const float* input,
                                                   const float* taps,
                                                   unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  float dotProductVector[4];
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 a0Val, a1Val, a2Val, a3Val;
  __m128 b0Val, b1Val, b2Val, b3Val;
  __m128 c0Val, c1Val, c2Val, c3Val;

  __m128 dotProdVal0 = _mm_setzero_ps();
  __m128 dotProdVal1 = _mm_setzero_ps();
  __m128 dotProdVal2 = _mm_setzero_ps();
  __m128 dotProdVal3 = _mm_setzero_ps();

  for (; number < sixteenthPoints; number++) {
    a0Val = _mm_loadu_ps(aPtr);
    a1Val = _mm_loadu_ps(aPtr + 4);
    a2Val = _mm_loadu_ps(aPtr + 8);
    a3Val = _mm_loadu_ps(aPtr + 12);
    b0Val = _mm_loadu_ps(bPtr);
    b1Val = _mm_loadu_ps(bPtr + 4);
    b2Val = _mm_loadu_ps(bPtr + 8);
    b3Val = _mm_loadu_ps(bPtr + 12);

    c0Val = _mm_mul_ps(a0Val, b0Val);
    c1Val = _mm_mul_ps(a1Val, b1Val);
    c2Val = _mm_mul_ps(a2Val, b2Val);
    c3Val = _mm_mul_ps(a3Val, b3Val);

    dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
    dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
    dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
    dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);

    /* Advance past the 16 points just consumed so the scalar tail loop
     * below starts at the first unprocessed element. */
    aPtr += 16;
    bPtr += 16;
  }

  /* Fold the four accumulators into one vector of four partial sums. */
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

  /* Unaligned store: the scratch array carries no alignment attribute. */
  _mm_storeu_ps(dotProductVector, dotProdVal0);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  number = sixteenthPoints * 16;
  for (; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
188 #ifdef LV_HAVE_SSE4_1
190 #include <smmintrin.h>
/*
 * Dot product of two float vectors using the SSE4.1 _mm_dp_ps instruction
 * with unaligned loads.
 *
 * result     - receives sum(input[i] * taps[i]) for i in [0, num_points)
 * input      - first operand; read with unaligned loads
 * taps       - second operand; read with unaligned loads
 * num_points - number of element pairs; 0 stores 0 into *result
 *
 * The vector loop consumes 16 points per iteration; the remaining
 * num_points % 16 points go through a scalar loop.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float* result,
                                                     const float* input,
                                                     const float* taps,
                                                     unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  float dotProductVector[4];
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 aVal1, bVal1, cVal1;
  __m128 aVal2, bVal2, cVal2;
  __m128 aVal3, bVal3, cVal3;
  __m128 aVal4, bVal4, cVal4;

  __m128 dotProdVal = _mm_setzero_ps();

  for (; number < sixteenthPoints; number++) {
    aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
    aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
    aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
    aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;

    bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
    bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
    bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
    bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;

    /* High nibble 0xF: multiply all four lanes. Low nibble selects the
     * single output lane that receives the 4-element sum (others are 0),
     * so each group of four lands in a different lane. */
    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);

    /* The four results occupy disjoint lanes, so OR merges them into one
     * vector of four partial sums. */
    cVal1 = _mm_or_ps(cVal1, cVal2);
    cVal3 = _mm_or_ps(cVal3, cVal4);
    cVal1 = _mm_or_ps(cVal1, cVal3);

    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
  }

  /* Unaligned store: the scratch array carries no alignment attribute. */
  _mm_storeu_ps(dotProductVector, dotProdVal);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  number = sixteenthPoints * 16;
  for (; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
251 #include <immintrin.h>
/*
 * Dot product of two float vectors using AVX with unaligned loads.
 *
 * result     - receives sum(input[i] * taps[i]) for i in [0, num_points)
 * input      - first operand; read with unaligned loads
 * taps       - second operand; read with unaligned loads
 * num_points - number of element pairs; 0 stores 0 into *result
 *
 * The vector loop consumes 16 points per iteration into two independent
 * 8-lane accumulators; the remaining num_points % 16 points go through a
 * scalar loop.
 */
static inline void volk_32f_x2_dot_prod_32f_u_avx(float* result,
                                                  const float* input,
                                                  const float* taps,
                                                  unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  float dotProductVector[8];
  const float* aPtr = input;
  const float* bPtr = taps;

  __m256 a0Val, a1Val;
  __m256 b0Val, b1Val;
  __m256 c0Val, c1Val;

  __m256 dotProdVal0 = _mm256_setzero_ps();
  __m256 dotProdVal1 = _mm256_setzero_ps();

  for (; number < sixteenthPoints; number++) {
    a0Val = _mm256_loadu_ps(aPtr);
    a1Val = _mm256_loadu_ps(aPtr + 8);
    b0Val = _mm256_loadu_ps(bPtr);
    b1Val = _mm256_loadu_ps(bPtr + 8);

    c0Val = _mm256_mul_ps(a0Val, b0Val);
    c1Val = _mm256_mul_ps(a1Val, b1Val);

    dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
    dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);

    /* Advance past the 16 points just consumed so the scalar tail loop
     * below starts at the first unprocessed element. */
    aPtr += 16;
    bPtr += 16;
  }

  /* Fold the two accumulators into one vector of eight partial sums. */
  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);

  _mm256_storeu_ps(dotProductVector, dotProdVal0);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];
  dotProduct += dotProductVector[4];
  dotProduct += dotProductVector[5];
  dotProduct += dotProductVector[6];
  dotProduct += dotProductVector[7];

  number = sixteenthPoints * 16;
  for (; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
313 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
314 #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
320 #ifdef LV_HAVE_GENERIC
/*
 * Dot product of two float vectors, portable scalar kernel
 * (aligned-API variant; the scalar code itself makes no alignment demands).
 *
 * result     - receives sum(input[i] * taps[i]) for i in [0, num_points)
 * input      - first operand; num_points floats are read
 * taps       - second operand; num_points floats are read
 * num_points - number of element pairs; 0 stores 0 into *result
 */
static inline void volk_32f_x2_dot_prod_32f_a_generic(float* result,
                                                      const float* input,
                                                      const float* taps,
                                                      unsigned int num_points)
{
  float dotProduct = 0;
  const float* aPtr = input;
  const float* bPtr = taps;
  unsigned int number = 0;

  for (number = 0; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
/*
 * Dot product of two float vectors using SSE with aligned loads.
 *
 * result     - receives sum(input[i] * taps[i]) for i in [0, num_points)
 * input      - first operand; read with _mm_load_ps, so it must be
 *              16-byte aligned whenever num_points >= 16
 * taps       - second operand; same alignment requirement as input
 * num_points - number of element pairs; 0 stores 0 into *result
 *
 * The vector loop consumes 16 points per iteration into four independent
 * accumulators; the remaining num_points % 16 points go through a scalar loop.
 */
static inline void volk_32f_x2_dot_prod_32f_a_sse(float* result,
                                                  const float* input,
                                                  const float* taps,
                                                  unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  float dotProductVector[4];
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 a0Val, a1Val, a2Val, a3Val;
  __m128 b0Val, b1Val, b2Val, b3Val;
  __m128 c0Val, c1Val, c2Val, c3Val;

  __m128 dotProdVal0 = _mm_setzero_ps();
  __m128 dotProdVal1 = _mm_setzero_ps();
  __m128 dotProdVal2 = _mm_setzero_ps();
  __m128 dotProdVal3 = _mm_setzero_ps();

  for (; number < sixteenthPoints; number++) {
    a0Val = _mm_load_ps(aPtr);
    a1Val = _mm_load_ps(aPtr + 4);
    a2Val = _mm_load_ps(aPtr + 8);
    a3Val = _mm_load_ps(aPtr + 12);
    b0Val = _mm_load_ps(bPtr);
    b1Val = _mm_load_ps(bPtr + 4);
    b2Val = _mm_load_ps(bPtr + 8);
    b3Val = _mm_load_ps(bPtr + 12);

    c0Val = _mm_mul_ps(a0Val, b0Val);
    c1Val = _mm_mul_ps(a1Val, b1Val);
    c2Val = _mm_mul_ps(a2Val, b2Val);
    c3Val = _mm_mul_ps(a3Val, b3Val);

    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

    /* Advance past the 16 points just consumed so the scalar tail loop
     * below starts at the first unprocessed element. */
    aPtr += 16;
    bPtr += 16;
  }

  /* Fold the four accumulators into one vector of four partial sums. */
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

  /* Unaligned store: the local scratch array carries no alignment
   * attribute, so only the caller's buffers need to be aligned. */
  _mm_storeu_ps(dotProductVector, dotProdVal0);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  number = sixteenthPoints * 16;
  for (; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
412 #include <pmmintrin.h>
/*
 * Dot product of two float vectors, SSE3-tagged kernel with aligned loads.
 *
 * result     - receives sum(input[i] * taps[i]) for i in [0, num_points)
 * input      - first operand; read with _mm_load_ps, so it must be
 *              16-byte aligned whenever num_points >= 16
 * taps       - second operand; same alignment requirement as input
 * num_points - number of element pairs; 0 stores 0 into *result
 *
 * Every intrinsic used here (_mm_load_ps, _mm_mul_ps, _mm_add_ps,
 * _mm_storeu_ps) is a baseline SSE operation. The vector loop consumes
 * 16 points per iteration into four independent accumulators; the
 * remaining num_points % 16 points go through a scalar loop.
 */
static inline void volk_32f_x2_dot_prod_32f_a_sse3(float* result,
                                                   const float* input,
                                                   const float* taps,
                                                   unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  float dotProductVector[4];
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 a0Val, a1Val, a2Val, a3Val;
  __m128 b0Val, b1Val, b2Val, b3Val;
  __m128 c0Val, c1Val, c2Val, c3Val;

  __m128 dotProdVal0 = _mm_setzero_ps();
  __m128 dotProdVal1 = _mm_setzero_ps();
  __m128 dotProdVal2 = _mm_setzero_ps();
  __m128 dotProdVal3 = _mm_setzero_ps();

  for (; number < sixteenthPoints; number++) {
    a0Val = _mm_load_ps(aPtr);
    a1Val = _mm_load_ps(aPtr + 4);
    a2Val = _mm_load_ps(aPtr + 8);
    a3Val = _mm_load_ps(aPtr + 12);
    b0Val = _mm_load_ps(bPtr);
    b1Val = _mm_load_ps(bPtr + 4);
    b2Val = _mm_load_ps(bPtr + 8);
    b3Val = _mm_load_ps(bPtr + 12);

    c0Val = _mm_mul_ps(a0Val, b0Val);
    c1Val = _mm_mul_ps(a1Val, b1Val);
    c2Val = _mm_mul_ps(a2Val, b2Val);
    c3Val = _mm_mul_ps(a3Val, b3Val);

    dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
    dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
    dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
    dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);

    /* Advance past the 16 points just consumed so the scalar tail loop
     * below starts at the first unprocessed element. */
    aPtr += 16;
    bPtr += 16;
  }

  /* Fold the four accumulators into one vector of four partial sums. */
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

  /* Unaligned store: the local scratch array carries no alignment
   * attribute, so only the caller's buffers need to be aligned. */
  _mm_storeu_ps(dotProductVector, dotProdVal0);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  number = sixteenthPoints * 16;
  for (; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
478 #ifdef LV_HAVE_SSE4_1
480 #include <smmintrin.h>
/*
 * Dot product of two float vectors using the SSE4.1 _mm_dp_ps instruction
 * with aligned loads.
 *
 * result     - receives sum(input[i] * taps[i]) for i in [0, num_points)
 * input      - first operand; read with _mm_load_ps, so it must be
 *              16-byte aligned whenever num_points >= 16
 * taps       - second operand; same alignment requirement as input
 * num_points - number of element pairs; 0 stores 0 into *result
 *
 * The vector loop consumes 16 points per iteration; the remaining
 * num_points % 16 points go through a scalar loop.
 */
static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float* result,
                                                     const float* input,
                                                     const float* taps,
                                                     unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  float dotProductVector[4];
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 aVal1, bVal1, cVal1;
  __m128 aVal2, bVal2, cVal2;
  __m128 aVal3, bVal3, cVal3;
  __m128 aVal4, bVal4, cVal4;

  __m128 dotProdVal = _mm_setzero_ps();

  for (; number < sixteenthPoints; number++) {
    aVal1 = _mm_load_ps(aPtr); aPtr += 4;
    aVal2 = _mm_load_ps(aPtr); aPtr += 4;
    aVal3 = _mm_load_ps(aPtr); aPtr += 4;
    aVal4 = _mm_load_ps(aPtr); aPtr += 4;

    bVal1 = _mm_load_ps(bPtr); bPtr += 4;
    bVal2 = _mm_load_ps(bPtr); bPtr += 4;
    bVal3 = _mm_load_ps(bPtr); bPtr += 4;
    bVal4 = _mm_load_ps(bPtr); bPtr += 4;

    /* High nibble 0xF: multiply all four lanes. Low nibble selects the
     * single output lane that receives the 4-element sum (others are 0),
     * so each group of four lands in a different lane. */
    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);

    /* The four results occupy disjoint lanes, so OR merges them into one
     * vector of four partial sums. */
    cVal1 = _mm_or_ps(cVal1, cVal2);
    cVal3 = _mm_or_ps(cVal3, cVal4);
    cVal1 = _mm_or_ps(cVal1, cVal3);

    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
  }

  /* Unaligned store: the local scratch array carries no alignment
   * attribute, so only the caller's buffers need to be aligned. */
  _mm_storeu_ps(dotProductVector, dotProdVal);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  number = sixteenthPoints * 16;
  for (; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
541 #include <immintrin.h>
/*
 * Dot product of two float vectors using AVX with aligned loads.
 *
 * result     - receives sum(input[i] * taps[i]) for i in [0, num_points)
 * input      - first operand; read with _mm256_load_ps, so it must be
 *              32-byte aligned whenever num_points >= 16
 * taps       - second operand; same alignment requirement as input
 * num_points - number of element pairs; 0 stores 0 into *result
 *
 * The vector loop consumes 16 points per iteration into two independent
 * 8-lane accumulators; the remaining num_points % 16 points go through a
 * scalar loop.
 */
static inline void volk_32f_x2_dot_prod_32f_a_avx(float* result,
                                                  const float* input,
                                                  const float* taps,
                                                  unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  float dotProductVector[8];
  const float* aPtr = input;
  const float* bPtr = taps;

  __m256 a0Val, a1Val;
  __m256 b0Val, b1Val;
  __m256 c0Val, c1Val;

  __m256 dotProdVal0 = _mm256_setzero_ps();
  __m256 dotProdVal1 = _mm256_setzero_ps();

  for (; number < sixteenthPoints; number++) {
    a0Val = _mm256_load_ps(aPtr);
    a1Val = _mm256_load_ps(aPtr + 8);
    b0Val = _mm256_load_ps(bPtr);
    b1Val = _mm256_load_ps(bPtr + 8);

    c0Val = _mm256_mul_ps(a0Val, b0Val);
    c1Val = _mm256_mul_ps(a1Val, b1Val);

    dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
    dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);

    /* Advance past the 16 points just consumed so the scalar tail loop
     * below starts at the first unprocessed element. */
    aPtr += 16;
    bPtr += 16;
  }

  /* Fold the two accumulators into one vector of eight partial sums. */
  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);

  /* Unaligned store: the local scratch array carries no alignment
   * attribute, so only the caller's buffers need to be aligned. */
  _mm256_storeu_ps(dotProductVector, dotProdVal0);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];
  dotProduct += dotProductVector[4];
  dotProduct += dotProductVector[5];
  dotProduct += dotProductVector[6];
  dotProduct += dotProductVector[7];

  number = sixteenthPoints * 16;
  for (; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
603 #include <arm_neon.h>
/*
 * Dot product of two float vectors using NEON, 16 points per iteration.
 *
 * result     - receives sum(input[i] * taps[i]) for i in [0, num_points)
 * input      - first operand; num_points floats are read
 * taps       - second operand; num_points floats are read
 * num_points - number of element pairs; 0 stores 0 into *result
 *
 * The vector loop keeps four independent multiply-accumulate chains; the
 * remaining num_points % 16 points go through a scalar loop.
 */
static inline void volk_32f_x2_dot_prod_32f_neonopts(float* result,
                                                     const float* input,
                                                     const float* taps,
                                                     unsigned int num_points)
{
  const unsigned int sixteenth_points = num_points / 16;
  float dotProduct = 0;
  float accumulator[4];
  const float* aPtr = input;
  const float* bPtr = taps;
  unsigned int number = 0;

  float32x4x4_t a_val, b_val, accumulator0;
  accumulator0.val[0] = vdupq_n_f32(0);
  accumulator0.val[1] = vdupq_n_f32(0);
  accumulator0.val[2] = vdupq_n_f32(0);
  accumulator0.val[3] = vdupq_n_f32(0);

  for (number = 0; number < sixteenth_points; ++number) {
    /* vld4q de-interleaves with stride 4. Both operands are shuffled the
     * same way, so element i of input still meets element i of taps. */
    a_val = vld4q_f32(aPtr);
    b_val = vld4q_f32(bPtr);
    accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]);
    accumulator0.val[1] = vmlaq_f32(accumulator0.val[1], a_val.val[1], b_val.val[1]);
    accumulator0.val[2] = vmlaq_f32(accumulator0.val[2], a_val.val[2], b_val.val[2]);
    accumulator0.val[3] = vmlaq_f32(accumulator0.val[3], a_val.val[3], b_val.val[3]);

    /* Advance past the 16 points just consumed so the scalar tail loop
     * below starts at the first unprocessed element. */
    aPtr += 16;
    bPtr += 16;
  }

  /* Fold the four accumulators into one vector of four partial sums. */
  accumulator0.val[0] = vaddq_f32(accumulator0.val[0], accumulator0.val[1]);
  accumulator0.val[2] = vaddq_f32(accumulator0.val[2], accumulator0.val[3]);
  accumulator0.val[0] = vaddq_f32(accumulator0.val[2], accumulator0.val[0]);

  vst1q_f32(accumulator, accumulator0.val[0]);
  dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];

  for (number = sixteenth_points * 16; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
/*
 * Dot product of two float vectors using NEON, 8 points per iteration.
 *
 * result     - receives sum(input[i] * taps[i]) for i in [0, num_points)
 * input      - first operand; num_points floats are read
 * taps       - second operand; num_points floats are read
 * num_points - number of element pairs; 0 stores 0 into *result
 *
 * The vector loop keeps two independent multiply-accumulate chains; the
 * remaining num_points % 8 points go through a scalar loop.
 */
static inline void volk_32f_x2_dot_prod_32f_neon(float* result,
                                                 const float* input,
                                                 const float* taps,
                                                 unsigned int num_points)
{
  const unsigned int eighth_points = num_points / 8;
  float dotProduct = 0;
  float accumulator[4];
  const float* aPtr = input;
  const float* bPtr = taps;
  unsigned int number = 0;

  float32x4x2_t a_val, b_val, accumulator_val;
  accumulator_val.val[0] = vdupq_n_f32(0);
  accumulator_val.val[1] = vdupq_n_f32(0);

  for (number = 0; number < eighth_points; ++number) {
    /* vld2q de-interleaves with stride 2. Both operands are shuffled the
     * same way, so element i of input still meets element i of taps. */
    a_val = vld2q_f32(aPtr);
    b_val = vld2q_f32(bPtr);
    accumulator_val.val[0] = vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
    accumulator_val.val[1] = vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);

    /* Advance past the 8 points just consumed so the scalar tail loop
     * below starts at the first unprocessed element. */
    aPtr += 8;
    bPtr += 8;
  }

  /* Fold the two accumulators into one vector of four partial sums. */
  accumulator_val.val[0] = vaddq_f32(accumulator_val.val[0], accumulator_val.val[1]);

  vst1q_f32(accumulator, accumulator_val.val[0]);
  dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];

  for (number = eighth_points * 8; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
/*
 * Dot-product kernel declared here and defined outside this file
 * (presumably in NEON assembly, going by the name).
 * NOTE(review): the parameter roles presumably mirror the C kernels
 * (cVector = result, aVector = input, bVector = taps) - confirm against
 * the implementation.
 */
extern void volk_32f_x2_dot_prod_32f_neonasm(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points);
/*
 * Dot-product kernel declared here and defined outside this file
 * (presumably a tuned NEON assembly variant, going by the name).
 * NOTE(review): the parameter roles presumably mirror the C kernels
 * (cVector = result, aVector = input, bVector = taps) - confirm against
 * the implementation.
 */
extern void volk_32f_x2_dot_prod_32f_neonasm_opts(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points);
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9