GNU Radio Manual and C++ API Reference  3.7.6.1
The Free & Open Software Radio Ecosystem
volk_32fc_32f_dot_prod_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
24 #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
25 
26 #include <volk/volk_common.h>
27 #include<stdio.h>
28 
29 
30 #ifdef LV_HAVE_GENERIC
31 
32 
33 static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) {
34 
35  float res[2];
36  float *realpt = &res[0], *imagpt = &res[1];
37  const float* aPtr = (float*)input;
38  const float* bPtr= taps;
39  unsigned int number = 0;
40 
41  *realpt = 0;
42  *imagpt = 0;
43 
44  for(number = 0; number < num_points; number++){
45  *realpt += ((*aPtr++) * (*bPtr));
46  *imagpt += ((*aPtr++) * (*bPtr++));
47  }
48 
49  *result = *(lv_32fc_t*)(&res[0]);
50 }
51 
52 #endif /*LV_HAVE_GENERIC*/
53 
54 
55 #ifdef LV_HAVE_AVX
56 
57 #include <immintrin.h>
58 
59 static inline void volk_32fc_32f_dot_prod_32fc_a_avx( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
60 
61  unsigned int number = 0;
62  const unsigned int sixteenthPoints = num_points / 16;
63 
64  float res[2];
65  float *realpt = &res[0], *imagpt = &res[1];
66  const float* aPtr = (float*)input;
67  const float* bPtr = taps;
68 
69  __m256 a0Val, a1Val, a2Val, a3Val;
70  __m256 b0Val, b1Val, b2Val, b3Val;
71  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
72  __m256 c0Val, c1Val, c2Val, c3Val;
73 
74  __m256 dotProdVal0 = _mm256_setzero_ps();
75  __m256 dotProdVal1 = _mm256_setzero_ps();
76  __m256 dotProdVal2 = _mm256_setzero_ps();
77  __m256 dotProdVal3 = _mm256_setzero_ps();
78 
79  for(;number < sixteenthPoints; number++){
80 
81  a0Val = _mm256_load_ps(aPtr);
82  a1Val = _mm256_load_ps(aPtr+8);
83  a2Val = _mm256_load_ps(aPtr+16);
84  a3Val = _mm256_load_ps(aPtr+24);
85 
86  x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
87  x1Val = _mm256_load_ps(bPtr+8);
88  x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
89  x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
90  x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
91  x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
92 
93  // TODO: it may be possible to rearrange swizzling to better pipeline data
94  b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
95  b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
96  b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
97  b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
98 
99  c0Val = _mm256_mul_ps(a0Val, b0Val);
100  c1Val = _mm256_mul_ps(a1Val, b1Val);
101  c2Val = _mm256_mul_ps(a2Val, b2Val);
102  c3Val = _mm256_mul_ps(a3Val, b3Val);
103 
104  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
105  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
106  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
107  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
108 
109  aPtr += 32;
110  bPtr += 16;
111  }
112 
113  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
114  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
115  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
116 
117  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
118 
119  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
120 
121  *realpt = dotProductVector[0];
122  *imagpt = dotProductVector[1];
123  *realpt += dotProductVector[2];
124  *imagpt += dotProductVector[3];
125  *realpt += dotProductVector[4];
126  *imagpt += dotProductVector[5];
127  *realpt += dotProductVector[6];
128  *imagpt += dotProductVector[7];
129 
130  number = sixteenthPoints*16;
131  for(;number < num_points; number++){
132  *realpt += ((*aPtr++) * (*bPtr));
133  *imagpt += ((*aPtr++) * (*bPtr++));
134  }
135 
136  *result = *(lv_32fc_t*)(&res[0]);
137 }
138 
139 #endif /*LV_HAVE_AVX*/
140 
141 
142 
143 
144 #ifdef LV_HAVE_SSE
145 
146 
147 static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
148 
149  unsigned int number = 0;
150  const unsigned int sixteenthPoints = num_points / 8;
151 
152  float res[2];
153  float *realpt = &res[0], *imagpt = &res[1];
154  const float* aPtr = (float*)input;
155  const float* bPtr = taps;
156 
157  __m128 a0Val, a1Val, a2Val, a3Val;
158  __m128 b0Val, b1Val, b2Val, b3Val;
159  __m128 x0Val, x1Val, x2Val, x3Val;
160  __m128 c0Val, c1Val, c2Val, c3Val;
161 
162  __m128 dotProdVal0 = _mm_setzero_ps();
163  __m128 dotProdVal1 = _mm_setzero_ps();
164  __m128 dotProdVal2 = _mm_setzero_ps();
165  __m128 dotProdVal3 = _mm_setzero_ps();
166 
167  for(;number < sixteenthPoints; number++){
168 
169  a0Val = _mm_load_ps(aPtr);
170  a1Val = _mm_load_ps(aPtr+4);
171  a2Val = _mm_load_ps(aPtr+8);
172  a3Val = _mm_load_ps(aPtr+12);
173 
174  x0Val = _mm_load_ps(bPtr);
175  x1Val = _mm_load_ps(bPtr);
176  x2Val = _mm_load_ps(bPtr+4);
177  x3Val = _mm_load_ps(bPtr+4);
178  b0Val = _mm_unpacklo_ps(x0Val, x1Val);
179  b1Val = _mm_unpackhi_ps(x0Val, x1Val);
180  b2Val = _mm_unpacklo_ps(x2Val, x3Val);
181  b3Val = _mm_unpackhi_ps(x2Val, x3Val);
182 
183  c0Val = _mm_mul_ps(a0Val, b0Val);
184  c1Val = _mm_mul_ps(a1Val, b1Val);
185  c2Val = _mm_mul_ps(a2Val, b2Val);
186  c3Val = _mm_mul_ps(a3Val, b3Val);
187 
188  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
189  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
190  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
191  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
192 
193  aPtr += 16;
194  bPtr += 8;
195  }
196 
197  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
198  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
199  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
200 
201  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
202 
203  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
204 
205  *realpt = dotProductVector[0];
206  *imagpt = dotProductVector[1];
207  *realpt += dotProductVector[2];
208  *imagpt += dotProductVector[3];
209 
210  number = sixteenthPoints*8;
211  for(;number < num_points; number++){
212  *realpt += ((*aPtr++) * (*bPtr));
213  *imagpt += ((*aPtr++) * (*bPtr++));
214  }
215 
216  *result = *(lv_32fc_t*)(&res[0]);
217 }
218 
219 #endif /*LV_HAVE_SSE*/
220 
221 
222 
223 #ifdef LV_HAVE_AVX
224 
225 #include <immintrin.h>
226 
227 static inline void volk_32fc_32f_dot_prod_32fc_u_avx( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
228 
229  unsigned int number = 0;
230  const unsigned int sixteenthPoints = num_points / 16;
231 
232  float res[2];
233  float *realpt = &res[0], *imagpt = &res[1];
234  const float* aPtr = (float*)input;
235  const float* bPtr = taps;
236 
237  __m256 a0Val, a1Val, a2Val, a3Val;
238  __m256 b0Val, b1Val, b2Val, b3Val;
239  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
240  __m256 c0Val, c1Val, c2Val, c3Val;
241 
242  __m256 dotProdVal0 = _mm256_setzero_ps();
243  __m256 dotProdVal1 = _mm256_setzero_ps();
244  __m256 dotProdVal2 = _mm256_setzero_ps();
245  __m256 dotProdVal3 = _mm256_setzero_ps();
246 
247  for(;number < sixteenthPoints; number++){
248 
249  a0Val = _mm256_loadu_ps(aPtr);
250  a1Val = _mm256_loadu_ps(aPtr+8);
251  a2Val = _mm256_loadu_ps(aPtr+16);
252  a3Val = _mm256_loadu_ps(aPtr+24);
253 
254  x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
255  x1Val = _mm256_loadu_ps(bPtr+8);
256  x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
257  x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
258  x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
259  x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
260 
261  // TODO: it may be possible to rearrange swizzling to better pipeline data
262  b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
263  b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
264  b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
265  b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
266 
267  c0Val = _mm256_mul_ps(a0Val, b0Val);
268  c1Val = _mm256_mul_ps(a1Val, b1Val);
269  c2Val = _mm256_mul_ps(a2Val, b2Val);
270  c3Val = _mm256_mul_ps(a3Val, b3Val);
271 
272  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
273  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
274  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
275  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
276 
277  aPtr += 32;
278  bPtr += 16;
279  }
280 
281  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
282  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
283  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
284 
285  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
286 
287  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
288 
289  *realpt = dotProductVector[0];
290  *imagpt = dotProductVector[1];
291  *realpt += dotProductVector[2];
292  *imagpt += dotProductVector[3];
293  *realpt += dotProductVector[4];
294  *imagpt += dotProductVector[5];
295  *realpt += dotProductVector[6];
296  *imagpt += dotProductVector[7];
297 
298  number = sixteenthPoints*16;
299  for(;number < num_points; number++){
300  *realpt += ((*aPtr++) * (*bPtr));
301  *imagpt += ((*aPtr++) * (*bPtr++));
302  }
303 
304  *result = *(lv_32fc_t*)(&res[0]);
305 }
306 #endif /*LV_HAVE_AVX*/
307 
308 #ifdef LV_HAVE_NEON
309 #include <arm_neon.h>
310 
311 static inline void volk_32fc_32f_dot_prod_32fc_neon_unroll ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) {
312 
313  unsigned int number;
314  const unsigned int quarterPoints = num_points / 8;
315 
316  float res[2];
317  float *realpt = &res[0], *imagpt = &res[1];
318  const float* inputPtr = (float*)input;
319  const float* tapsPtr = taps;
320  float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
321  float accVector_real[4];
322  float accVector_imag[4];
323 
324  float32x4x2_t inputVector0, inputVector1;
325  float32x4_t tapsVector0, tapsVector1;
326  float32x4_t tmp_real0, tmp_imag0;
327  float32x4_t tmp_real1, tmp_imag1;
328  float32x4_t real_accumulator0, imag_accumulator0;
329  float32x4_t real_accumulator1, imag_accumulator1;
330 
331  // zero out accumulators
332  // take a *float, return float32x4_t
333  real_accumulator0 = vld1q_f32( zero );
334  imag_accumulator0 = vld1q_f32( zero );
335  real_accumulator1 = vld1q_f32( zero );
336  imag_accumulator1 = vld1q_f32( zero );
337 
338  for(number=0 ;number < quarterPoints; number++){
339  // load doublewords and duplicate in to second lane
340  tapsVector0 = vld1q_f32(tapsPtr );
341  tapsVector1 = vld1q_f32(tapsPtr+4 );
342 
343  // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag
344  inputVector0 = vld2q_f32(inputPtr );
345  inputVector1 = vld2q_f32(inputPtr+8 );
346  // inputVector is now a struct of two vectors, 0th is real, 1st is imag
347 
348  tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
349  tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);
350 
351  tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
352  tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);
353 
354  real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
355  imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);
356 
357  real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
358  imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);
359 
360  tapsPtr += 8;
361  inputPtr += 16;
362  }
363 
364  real_accumulator0 = vaddq_f32( real_accumulator0, real_accumulator1);
365  imag_accumulator0 = vaddq_f32( imag_accumulator0, imag_accumulator1);
366  // void vst1q_f32( float32_t * ptr, float32x4_t val);
367  // store results back to a complex (array of 2 floats)
368  vst1q_f32(accVector_real, real_accumulator0);
369  vst1q_f32(accVector_imag, imag_accumulator0);
370  *realpt = accVector_real[0] + accVector_real[1] +
371  accVector_real[2] + accVector_real[3] ;
372 
373  *imagpt = accVector_imag[0] + accVector_imag[1] +
374  accVector_imag[2] + accVector_imag[3] ;
375 
376  // clean up the remainder
377  for(number=quarterPoints*8; number < num_points; number++){
378  *realpt += ((*inputPtr++) * (*tapsPtr));
379  *imagpt += ((*inputPtr++) * (*tapsPtr++));
380  }
381 
382  *result = *(lv_32fc_t*)(&res[0]);
383 }
384 
385 #endif /*LV_HAVE_NEON*/
386 
387 #ifdef LV_HAVE_NEON
388 #include <arm_neon.h>
389 
390 static inline void volk_32fc_32f_dot_prod_32fc_a_neon ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) {
391 
392  unsigned int number;
393  const unsigned int quarterPoints = num_points / 4;
394 
395  float res[2];
396  float *realpt = &res[0], *imagpt = &res[1];
397  const float* inputPtr = (float*)input;
398  const float* tapsPtr = taps;
399  float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
400  float accVector_real[4];
401  float accVector_imag[4];
402 
403  float32x4x2_t inputVector;
404  float32x4_t tapsVector;
405  float32x4_t tmp_real, tmp_imag;
406  float32x4_t real_accumulator, imag_accumulator;
407 
408 
409  // zero out accumulators
410  // take a *float, return float32x4_t
411  real_accumulator = vld1q_f32( zero );
412  imag_accumulator = vld1q_f32( zero );
413 
414  for(number=0 ;number < quarterPoints; number++){
415  // load taps ( float32x2x2_t = vld1q_f32( float32_t const * ptr) )
416  // load doublewords and duplicate in to second lane
417  tapsVector = vld1q_f32(tapsPtr );
418 
419  // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag
420  inputVector = vld2q_f32(inputPtr );
421 
422  tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
423  tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);
424 
425  real_accumulator = vaddq_f32(real_accumulator, tmp_real);
426  imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);
427 
428 
429  tapsPtr += 4;
430  inputPtr += 8;
431 
432  }
433 
434  // store results back to a complex (array of 2 floats)
435  vst1q_f32(accVector_real, real_accumulator);
436  vst1q_f32(accVector_imag, imag_accumulator);
437  *realpt = accVector_real[0] + accVector_real[1] +
438  accVector_real[2] + accVector_real[3] ;
439 
440  *imagpt = accVector_imag[0] + accVector_imag[1] +
441  accVector_imag[2] + accVector_imag[3] ;
442 
443  // clean up the remainder
444  for(number=quarterPoints*4; number < num_points; number++){
445  *realpt += ((*inputPtr++) * (*tapsPtr));
446  *imagpt += ((*inputPtr++) * (*tapsPtr++));
447  }
448 
449  *result = *(lv_32fc_t*)(&res[0]);
450 }
451 
452 #endif /*LV_HAVE_NEON*/
453 
454 #ifdef LV_HAVE_NEON
455 extern void volk_32fc_32f_dot_prod_32fc_a_neonasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
456 #endif /*LV_HAVE_NEON*/
457 
458 #ifdef LV_HAVE_NEON
459 extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
460 #endif /*LV_HAVE_NEON*/
461 
462 #ifdef LV_HAVE_SSE
463 
464 static inline void volk_32fc_32f_dot_prod_32fc_u_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
465 
466  unsigned int number = 0;
467  const unsigned int sixteenthPoints = num_points / 8;
468 
469  float res[2];
470  float *realpt = &res[0], *imagpt = &res[1];
471  const float* aPtr = (float*)input;
472  const float* bPtr = taps;
473 
474  __m128 a0Val, a1Val, a2Val, a3Val;
475  __m128 b0Val, b1Val, b2Val, b3Val;
476  __m128 x0Val, x1Val, x2Val, x3Val;
477  __m128 c0Val, c1Val, c2Val, c3Val;
478 
479  __m128 dotProdVal0 = _mm_setzero_ps();
480  __m128 dotProdVal1 = _mm_setzero_ps();
481  __m128 dotProdVal2 = _mm_setzero_ps();
482  __m128 dotProdVal3 = _mm_setzero_ps();
483 
484  for(;number < sixteenthPoints; number++){
485 
486  a0Val = _mm_loadu_ps(aPtr);
487  a1Val = _mm_loadu_ps(aPtr+4);
488  a2Val = _mm_loadu_ps(aPtr+8);
489  a3Val = _mm_loadu_ps(aPtr+12);
490 
491  x0Val = _mm_loadu_ps(bPtr);
492  x1Val = _mm_loadu_ps(bPtr);
493  x2Val = _mm_loadu_ps(bPtr+4);
494  x3Val = _mm_loadu_ps(bPtr+4);
495  b0Val = _mm_unpacklo_ps(x0Val, x1Val);
496  b1Val = _mm_unpackhi_ps(x0Val, x1Val);
497  b2Val = _mm_unpacklo_ps(x2Val, x3Val);
498  b3Val = _mm_unpackhi_ps(x2Val, x3Val);
499 
500  c0Val = _mm_mul_ps(a0Val, b0Val);
501  c1Val = _mm_mul_ps(a1Val, b1Val);
502  c2Val = _mm_mul_ps(a2Val, b2Val);
503  c3Val = _mm_mul_ps(a3Val, b3Val);
504 
505  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
506  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
507  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
508  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
509 
510  aPtr += 16;
511  bPtr += 8;
512  }
513 
514  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
515  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
516  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
517 
518  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
519 
520  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
521 
522  *realpt = dotProductVector[0];
523  *imagpt = dotProductVector[1];
524  *realpt += dotProductVector[2];
525  *imagpt += dotProductVector[3];
526 
527  number = sixteenthPoints*8;
528  for(;number < num_points; number++){
529  *realpt += ((*aPtr++) * (*bPtr));
530  *imagpt += ((*aPtr++) * (*bPtr++));
531  }
532 
533  *result = *(lv_32fc_t*)(&res[0]);
534 }
535 
536 #endif /*LV_HAVE_SSE*/
537 
538 
539 #endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_H*/
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9
float complex lv_32fc_t
Definition: volk_complex.h:56