GNU Radio Manual and C++ API Reference  3.7.6.1
The Free & Open Software Radio Ecosystem
volk_32f_x2_dot_prod_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
24 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
25 
26 #include <volk/volk_common.h>
27 #include<stdio.h>
28 
29 
30 #ifdef LV_HAVE_GENERIC
31 
32 
/*
 * Portable scalar dot product.
 *
 * Accumulates input[i] * taps[i] for i in [0, num_points), in index order,
 * into a single float and stores the sum in *result. When num_points is 0
 * the stored value is 0.
 */
static inline void volk_32f_x2_dot_prod_32f_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
  float acc = 0;
  unsigned int i;

  for(i = 0; i < num_points; i++){
    acc += input[i] * taps[i];
  }

  *result = acc;
}
46 
47 #endif /*LV_HAVE_GENERIC*/
48 
49 
50 #ifdef LV_HAVE_SSE
51 
52 
53 static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
54 
55  unsigned int number = 0;
56  const unsigned int sixteenthPoints = num_points / 16;
57 
58  float dotProduct = 0;
59  const float* aPtr = input;
60  const float* bPtr = taps;
61 
62  __m128 a0Val, a1Val, a2Val, a3Val;
63  __m128 b0Val, b1Val, b2Val, b3Val;
64  __m128 c0Val, c1Val, c2Val, c3Val;
65 
66  __m128 dotProdVal0 = _mm_setzero_ps();
67  __m128 dotProdVal1 = _mm_setzero_ps();
68  __m128 dotProdVal2 = _mm_setzero_ps();
69  __m128 dotProdVal3 = _mm_setzero_ps();
70 
71  for(;number < sixteenthPoints; number++){
72 
73  a0Val = _mm_loadu_ps(aPtr);
74  a1Val = _mm_loadu_ps(aPtr+4);
75  a2Val = _mm_loadu_ps(aPtr+8);
76  a3Val = _mm_loadu_ps(aPtr+12);
77  b0Val = _mm_loadu_ps(bPtr);
78  b1Val = _mm_loadu_ps(bPtr+4);
79  b2Val = _mm_loadu_ps(bPtr+8);
80  b3Val = _mm_loadu_ps(bPtr+12);
81 
82  c0Val = _mm_mul_ps(a0Val, b0Val);
83  c1Val = _mm_mul_ps(a1Val, b1Val);
84  c2Val = _mm_mul_ps(a2Val, b2Val);
85  c3Val = _mm_mul_ps(a3Val, b3Val);
86 
87  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
88  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
89  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
90  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
91 
92  aPtr += 16;
93  bPtr += 16;
94  }
95 
96  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
97  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
98  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
99 
100  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
101 
102  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
103 
104  dotProduct = dotProductVector[0];
105  dotProduct += dotProductVector[1];
106  dotProduct += dotProductVector[2];
107  dotProduct += dotProductVector[3];
108 
109  number = sixteenthPoints*16;
110  for(;number < num_points; number++){
111  dotProduct += ((*aPtr++) * (*bPtr++));
112  }
113 
114  *result = dotProduct;
115 
116 }
117 
118 #endif /*LV_HAVE_SSE*/
119 
120 #ifdef LV_HAVE_SSE3
121 
122 #include <pmmintrin.h>
123 
124 static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
125  unsigned int number = 0;
126  const unsigned int sixteenthPoints = num_points / 16;
127 
128  float dotProduct = 0;
129  const float* aPtr = input;
130  const float* bPtr = taps;
131 
132  __m128 a0Val, a1Val, a2Val, a3Val;
133  __m128 b0Val, b1Val, b2Val, b3Val;
134  __m128 c0Val, c1Val, c2Val, c3Val;
135 
136  __m128 dotProdVal0 = _mm_setzero_ps();
137  __m128 dotProdVal1 = _mm_setzero_ps();
138  __m128 dotProdVal2 = _mm_setzero_ps();
139  __m128 dotProdVal3 = _mm_setzero_ps();
140 
141  for(;number < sixteenthPoints; number++){
142 
143  a0Val = _mm_loadu_ps(aPtr);
144  a1Val = _mm_loadu_ps(aPtr+4);
145  a2Val = _mm_loadu_ps(aPtr+8);
146  a3Val = _mm_loadu_ps(aPtr+12);
147  b0Val = _mm_loadu_ps(bPtr);
148  b1Val = _mm_loadu_ps(bPtr+4);
149  b2Val = _mm_loadu_ps(bPtr+8);
150  b3Val = _mm_loadu_ps(bPtr+12);
151 
152  c0Val = _mm_mul_ps(a0Val, b0Val);
153  c1Val = _mm_mul_ps(a1Val, b1Val);
154  c2Val = _mm_mul_ps(a2Val, b2Val);
155  c3Val = _mm_mul_ps(a3Val, b3Val);
156 
157  dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
158  dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
159  dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
160  dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
161 
162  aPtr += 16;
163  bPtr += 16;
164  }
165 
166  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
167  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
168  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
169 
170  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
171  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
172 
173  dotProduct = dotProductVector[0];
174  dotProduct += dotProductVector[1];
175  dotProduct += dotProductVector[2];
176  dotProduct += dotProductVector[3];
177 
178  number = sixteenthPoints*16;
179  for(;number < num_points; number++){
180  dotProduct += ((*aPtr++) * (*bPtr++));
181  }
182 
183  *result = dotProduct;
184 }
185 
186 #endif /*LV_HAVE_SSE3*/
187 
188 #ifdef LV_HAVE_SSE4_1
189 
190 #include <smmintrin.h>
191 
192 static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
193  unsigned int number = 0;
194  const unsigned int sixteenthPoints = num_points / 16;
195 
196  float dotProduct = 0;
197  const float* aPtr = input;
198  const float* bPtr = taps;
199 
200  __m128 aVal1, bVal1, cVal1;
201  __m128 aVal2, bVal2, cVal2;
202  __m128 aVal3, bVal3, cVal3;
203  __m128 aVal4, bVal4, cVal4;
204 
205  __m128 dotProdVal = _mm_setzero_ps();
206 
207  for(;number < sixteenthPoints; number++){
208 
209  aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
210  aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
211  aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
212  aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
213 
214  bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
215  bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
216  bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
217  bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
218 
219  cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
220  cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
221  cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
222  cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
223 
224  cVal1 = _mm_or_ps(cVal1, cVal2);
225  cVal3 = _mm_or_ps(cVal3, cVal4);
226  cVal1 = _mm_or_ps(cVal1, cVal3);
227 
228  dotProdVal = _mm_add_ps(dotProdVal, cVal1);
229  }
230 
231  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
232  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
233 
234  dotProduct = dotProductVector[0];
235  dotProduct += dotProductVector[1];
236  dotProduct += dotProductVector[2];
237  dotProduct += dotProductVector[3];
238 
239  number = sixteenthPoints * 16;
240  for(;number < num_points; number++){
241  dotProduct += ((*aPtr++) * (*bPtr++));
242  }
243 
244  *result = dotProduct;
245 }
246 
247 #endif /*LV_HAVE_SSE4_1*/
248 
249 #ifdef LV_HAVE_AVX
250 
251 #include <immintrin.h>
252 
253 static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
254 
255  unsigned int number = 0;
256  const unsigned int sixteenthPoints = num_points / 16;
257 
258  float dotProduct = 0;
259  const float* aPtr = input;
260  const float* bPtr = taps;
261 
262  __m256 a0Val, a1Val;
263  __m256 b0Val, b1Val;
264  __m256 c0Val, c1Val;
265 
266  __m256 dotProdVal0 = _mm256_setzero_ps();
267  __m256 dotProdVal1 = _mm256_setzero_ps();
268 
269  for(;number < sixteenthPoints; number++){
270 
271  a0Val = _mm256_loadu_ps(aPtr);
272  a1Val = _mm256_loadu_ps(aPtr+8);
273  b0Val = _mm256_loadu_ps(bPtr);
274  b1Val = _mm256_loadu_ps(bPtr+8);
275 
276  c0Val = _mm256_mul_ps(a0Val, b0Val);
277  c1Val = _mm256_mul_ps(a1Val, b1Val);
278 
279  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
280  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
281 
282  aPtr += 16;
283  bPtr += 16;
284  }
285 
286  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
287 
288  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
289 
290  _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
291 
292  dotProduct = dotProductVector[0];
293  dotProduct += dotProductVector[1];
294  dotProduct += dotProductVector[2];
295  dotProduct += dotProductVector[3];
296  dotProduct += dotProductVector[4];
297  dotProduct += dotProductVector[5];
298  dotProduct += dotProductVector[6];
299  dotProduct += dotProductVector[7];
300 
301  number = sixteenthPoints*16;
302  for(;number < num_points; number++){
303  dotProduct += ((*aPtr++) * (*bPtr++));
304  }
305 
306  *result = dotProduct;
307 
308 }
309 
310 #endif /*LV_HAVE_AVX*/
311 
312 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
313 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
314 #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
315 
316 #include <volk/volk_common.h>
317 #include<stdio.h>
318 
319 
320 #ifdef LV_HAVE_GENERIC
321 
322 
/*
 * Portable scalar dot product (entry in the aligned kernel family).
 *
 * Accumulates input[i] * taps[i] for i in [0, num_points), in index order,
 * into a single float and stores the sum in *result. The code itself makes
 * no use of alignment. When num_points is 0 the stored value is 0.
 */
static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
  float acc = 0;
  unsigned int i;

  for(i = 0; i < num_points; i++){
    acc += input[i] * taps[i];
  }

  *result = acc;
}
336 
337 #endif /*LV_HAVE_GENERIC*/
338 
339 
340 #ifdef LV_HAVE_SSE
341 
342 
343 static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
344 
345  unsigned int number = 0;
346  const unsigned int sixteenthPoints = num_points / 16;
347 
348  float dotProduct = 0;
349  const float* aPtr = input;
350  const float* bPtr = taps;
351 
352  __m128 a0Val, a1Val, a2Val, a3Val;
353  __m128 b0Val, b1Val, b2Val, b3Val;
354  __m128 c0Val, c1Val, c2Val, c3Val;
355 
356  __m128 dotProdVal0 = _mm_setzero_ps();
357  __m128 dotProdVal1 = _mm_setzero_ps();
358  __m128 dotProdVal2 = _mm_setzero_ps();
359  __m128 dotProdVal3 = _mm_setzero_ps();
360 
361  for(;number < sixteenthPoints; number++){
362 
363  a0Val = _mm_load_ps(aPtr);
364  a1Val = _mm_load_ps(aPtr+4);
365  a2Val = _mm_load_ps(aPtr+8);
366  a3Val = _mm_load_ps(aPtr+12);
367  b0Val = _mm_load_ps(bPtr);
368  b1Val = _mm_load_ps(bPtr+4);
369  b2Val = _mm_load_ps(bPtr+8);
370  b3Val = _mm_load_ps(bPtr+12);
371 
372  c0Val = _mm_mul_ps(a0Val, b0Val);
373  c1Val = _mm_mul_ps(a1Val, b1Val);
374  c2Val = _mm_mul_ps(a2Val, b2Val);
375  c3Val = _mm_mul_ps(a3Val, b3Val);
376 
377  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
378  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
379  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
380  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
381 
382  aPtr += 16;
383  bPtr += 16;
384  }
385 
386  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
387  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
388  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
389 
390  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
391 
392  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
393 
394  dotProduct = dotProductVector[0];
395  dotProduct += dotProductVector[1];
396  dotProduct += dotProductVector[2];
397  dotProduct += dotProductVector[3];
398 
399  number = sixteenthPoints*16;
400  for(;number < num_points; number++){
401  dotProduct += ((*aPtr++) * (*bPtr++));
402  }
403 
404  *result = dotProduct;
405 
406 }
407 
408 #endif /*LV_HAVE_SSE*/
409 
410 #ifdef LV_HAVE_SSE3
411 
412 #include <pmmintrin.h>
413 
414 static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
415  unsigned int number = 0;
416  const unsigned int sixteenthPoints = num_points / 16;
417 
418  float dotProduct = 0;
419  const float* aPtr = input;
420  const float* bPtr = taps;
421 
422  __m128 a0Val, a1Val, a2Val, a3Val;
423  __m128 b0Val, b1Val, b2Val, b3Val;
424  __m128 c0Val, c1Val, c2Val, c3Val;
425 
426  __m128 dotProdVal0 = _mm_setzero_ps();
427  __m128 dotProdVal1 = _mm_setzero_ps();
428  __m128 dotProdVal2 = _mm_setzero_ps();
429  __m128 dotProdVal3 = _mm_setzero_ps();
430 
431  for(;number < sixteenthPoints; number++){
432 
433  a0Val = _mm_load_ps(aPtr);
434  a1Val = _mm_load_ps(aPtr+4);
435  a2Val = _mm_load_ps(aPtr+8);
436  a3Val = _mm_load_ps(aPtr+12);
437  b0Val = _mm_load_ps(bPtr);
438  b1Val = _mm_load_ps(bPtr+4);
439  b2Val = _mm_load_ps(bPtr+8);
440  b3Val = _mm_load_ps(bPtr+12);
441 
442  c0Val = _mm_mul_ps(a0Val, b0Val);
443  c1Val = _mm_mul_ps(a1Val, b1Val);
444  c2Val = _mm_mul_ps(a2Val, b2Val);
445  c3Val = _mm_mul_ps(a3Val, b3Val);
446 
447  dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
448  dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
449  dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
450  dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
451 
452  aPtr += 16;
453  bPtr += 16;
454  }
455 
456  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
457  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
458  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
459 
460  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
461  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
462 
463  dotProduct = dotProductVector[0];
464  dotProduct += dotProductVector[1];
465  dotProduct += dotProductVector[2];
466  dotProduct += dotProductVector[3];
467 
468  number = sixteenthPoints*16;
469  for(;number < num_points; number++){
470  dotProduct += ((*aPtr++) * (*bPtr++));
471  }
472 
473  *result = dotProduct;
474 }
475 
476 #endif /*LV_HAVE_SSE3*/
477 
478 #ifdef LV_HAVE_SSE4_1
479 
480 #include <smmintrin.h>
481 
482 static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
483  unsigned int number = 0;
484  const unsigned int sixteenthPoints = num_points / 16;
485 
486  float dotProduct = 0;
487  const float* aPtr = input;
488  const float* bPtr = taps;
489 
490  __m128 aVal1, bVal1, cVal1;
491  __m128 aVal2, bVal2, cVal2;
492  __m128 aVal3, bVal3, cVal3;
493  __m128 aVal4, bVal4, cVal4;
494 
495  __m128 dotProdVal = _mm_setzero_ps();
496 
497  for(;number < sixteenthPoints; number++){
498 
499  aVal1 = _mm_load_ps(aPtr); aPtr += 4;
500  aVal2 = _mm_load_ps(aPtr); aPtr += 4;
501  aVal3 = _mm_load_ps(aPtr); aPtr += 4;
502  aVal4 = _mm_load_ps(aPtr); aPtr += 4;
503 
504  bVal1 = _mm_load_ps(bPtr); bPtr += 4;
505  bVal2 = _mm_load_ps(bPtr); bPtr += 4;
506  bVal3 = _mm_load_ps(bPtr); bPtr += 4;
507  bVal4 = _mm_load_ps(bPtr); bPtr += 4;
508 
509  cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
510  cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
511  cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
512  cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
513 
514  cVal1 = _mm_or_ps(cVal1, cVal2);
515  cVal3 = _mm_or_ps(cVal3, cVal4);
516  cVal1 = _mm_or_ps(cVal1, cVal3);
517 
518  dotProdVal = _mm_add_ps(dotProdVal, cVal1);
519  }
520 
521  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
522  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
523 
524  dotProduct = dotProductVector[0];
525  dotProduct += dotProductVector[1];
526  dotProduct += dotProductVector[2];
527  dotProduct += dotProductVector[3];
528 
529  number = sixteenthPoints * 16;
530  for(;number < num_points; number++){
531  dotProduct += ((*aPtr++) * (*bPtr++));
532  }
533 
534  *result = dotProduct;
535 }
536 
537 #endif /*LV_HAVE_SSE4_1*/
538 
539 #ifdef LV_HAVE_AVX
540 
541 #include <immintrin.h>
542 
543 static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
544 
545  unsigned int number = 0;
546  const unsigned int sixteenthPoints = num_points / 16;
547 
548  float dotProduct = 0;
549  const float* aPtr = input;
550  const float* bPtr = taps;
551 
552  __m256 a0Val, a1Val;
553  __m256 b0Val, b1Val;
554  __m256 c0Val, c1Val;
555 
556  __m256 dotProdVal0 = _mm256_setzero_ps();
557  __m256 dotProdVal1 = _mm256_setzero_ps();
558 
559  for(;number < sixteenthPoints; number++){
560 
561  a0Val = _mm256_load_ps(aPtr);
562  a1Val = _mm256_load_ps(aPtr+8);
563  b0Val = _mm256_load_ps(bPtr);
564  b1Val = _mm256_load_ps(bPtr+8);
565 
566  c0Val = _mm256_mul_ps(a0Val, b0Val);
567  c1Val = _mm256_mul_ps(a1Val, b1Val);
568 
569  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
570  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
571 
572  aPtr += 16;
573  bPtr += 16;
574  }
575 
576  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
577 
578  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
579 
580  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
581 
582  dotProduct = dotProductVector[0];
583  dotProduct += dotProductVector[1];
584  dotProduct += dotProductVector[2];
585  dotProduct += dotProductVector[3];
586  dotProduct += dotProductVector[4];
587  dotProduct += dotProductVector[5];
588  dotProduct += dotProductVector[6];
589  dotProduct += dotProductVector[7];
590 
591  number = sixteenthPoints*16;
592  for(;number < num_points; number++){
593  dotProduct += ((*aPtr++) * (*bPtr++));
594  }
595 
596  *result = dotProduct;
597 
598 }
599 
600 #endif /*LV_HAVE_AVX*/
601 
602 #ifdef LV_HAVE_NEON
603 #include <arm_neon.h>
604 
605 static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float * input, const float * taps, unsigned int num_points) {
606 
607  unsigned int quarter_points = num_points / 16;
608  float dotProduct = 0;
609  const float* aPtr = input;
610  const float* bPtr= taps;
611  unsigned int number = 0;
612 
613  float32x4x4_t a_val, b_val, accumulator0;
614  accumulator0.val[0] = vdupq_n_f32(0);
615  accumulator0.val[1] = vdupq_n_f32(0);
616  accumulator0.val[2] = vdupq_n_f32(0);
617  accumulator0.val[3] = vdupq_n_f32(0);
618  // factor of 4 loop unroll with independent accumulators
619  // uses 12 out of 16 neon q registers
620  for( number = 0; number < quarter_points; ++number) {
621  a_val = vld4q_f32(aPtr);
622  b_val = vld4q_f32(bPtr);
623  accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]);
624  accumulator0.val[1] = vmlaq_f32(accumulator0.val[1], a_val.val[1], b_val.val[1]);
625  accumulator0.val[2] = vmlaq_f32(accumulator0.val[2], a_val.val[2], b_val.val[2]);
626  accumulator0.val[3] = vmlaq_f32(accumulator0.val[3], a_val.val[3], b_val.val[3]);
627  aPtr += 16;
628  bPtr += 16;
629  }
630  accumulator0.val[0] = vaddq_f32(accumulator0.val[0], accumulator0.val[1]);
631  accumulator0.val[2] = vaddq_f32(accumulator0.val[2], accumulator0.val[3]);
632  accumulator0.val[0] = vaddq_f32(accumulator0.val[2], accumulator0.val[0]);
633  __VOLK_ATTR_ALIGNED(32) float accumulator[4];
634  vst1q_f32(accumulator, accumulator0.val[0]);
635  dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
636 
637  for(number = quarter_points*16; number < num_points; number++){
638  dotProduct += ((*aPtr++) * (*bPtr++));
639  }
640 
641  *result = dotProduct;
642 }
643 
644 #endif
645 
646 
647 
648 
649 #ifdef LV_HAVE_NEON
650 static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * input, const float * taps, unsigned int num_points) {
651 
652  unsigned int quarter_points = num_points / 8;
653  float dotProduct = 0;
654  const float* aPtr = input;
655  const float* bPtr= taps;
656  unsigned int number = 0;
657 
658  float32x4x2_t a_val, b_val, accumulator_val;
659  accumulator_val.val[0] = vdupq_n_f32(0);
660  accumulator_val.val[1] = vdupq_n_f32(0);
661  // factor of 2 loop unroll with independent accumulators
662  for( number = 0; number < quarter_points; ++number) {
663  a_val = vld2q_f32(aPtr);
664  b_val = vld2q_f32(bPtr);
665  accumulator_val.val[0] = vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
666  accumulator_val.val[1] = vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);
667  aPtr += 8;
668  bPtr += 8;
669  }
670  accumulator_val.val[0] = vaddq_f32(accumulator_val.val[0], accumulator_val.val[1]);
671  __VOLK_ATTR_ALIGNED(32) float accumulator[4];
672  vst1q_f32(accumulator, accumulator_val.val[0]);
673  dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
674 
675  for(number = quarter_points*8; number < num_points; number++){
676  dotProduct += ((*aPtr++) * (*bPtr++));
677  }
678 
679  *result = dotProduct;
680 }
681 
682 #endif /* LV_HAVE_NEON */
683 
684 #ifdef LV_HAVE_NEON
/*
 * Prototype only: the body is not in this header (extern linkage).
 * Takes the same argument types, in the same order, as the dot-product
 * kernels defined in this file.
 * NOTE(review): presumably cVector receives the scalar result and
 * aVector/bVector are the two inputs -- confirm against the implementation.
 */
extern void volk_32f_x2_dot_prod_32f_neonasm(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points);
686 #endif /* LV_HAVE_NEON */
687 
688 #ifdef LV_HAVE_NEON
/*
 * Prototype only: the body is not in this header (extern linkage).
 * Takes the same argument types, in the same order, as the dot-product
 * kernels defined in this file.
 * NOTE(review): presumably cVector receives the scalar result and
 * aVector/bVector are the two inputs -- confirm against the implementation.
 */
extern void volk_32f_x2_dot_prod_32f_neonasm_opts(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points);
690 #endif /* LV_HAVE_NEON */
691 
692 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9