GNU Radio Manual and C++ API Reference  3.7.6.1
The Free & Open Software Radio Ecosystem
volk_16i_32fc_dot_prod_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 #ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
24 #define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
25 
26 #include <volk/volk_common.h>
27 #include<stdio.h>
28 
29 
30 #ifdef LV_HAVE_GENERIC
31 
32 static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
33 
34  static const int N_UNROLL = 4;
35 
36  lv_32fc_t acc0 = 0;
37  lv_32fc_t acc1 = 0;
38  lv_32fc_t acc2 = 0;
39  lv_32fc_t acc3 = 0;
40 
41  unsigned i = 0;
42  unsigned n = (num_points / N_UNROLL) * N_UNROLL;
43 
44  for(i = 0; i < n; i += N_UNROLL) {
45  acc0 += taps[i + 0] * (float)input[i + 0];
46  acc1 += taps[i + 1] * (float)input[i + 1];
47  acc2 += taps[i + 2] * (float)input[i + 2];
48  acc3 += taps[i + 3] * (float)input[i + 3];
49  }
50 
51  for(; i < num_points; i++) {
52  acc0 += taps[i] * (float)input[i];
53  }
54 
55  *result = acc0 + acc1 + acc2 + acc3;
56 }
57 
58 #endif /*LV_HAVE_GENERIC*/
59 
60 #ifdef LV_HAVE_NEON
61 #include <arm_neon.h>
62 static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
63 
64  unsigned ii;
65  unsigned quarter_points = num_points / 4;
66  lv_32fc_t* tapsPtr = (lv_32fc_t*) taps;
67  short* inputPtr = (short*) input;
68  lv_32fc_t accumulator_vec[4];
69 
70  float32x4x2_t tapsVal, accumulator_val;
71  int16x4_t input16;
72  int32x4_t input32;
73  float32x4_t input_float, prod_re, prod_im;
74 
75  accumulator_val.val[0] = vdupq_n_f32(0.0);
76  accumulator_val.val[1] = vdupq_n_f32(0.0);
77 
78  for(ii = 0; ii < quarter_points; ++ii) {
79  tapsVal = vld2q_f32((float*)tapsPtr);
80  input16 = vld1_s16(inputPtr);
81  // widen 16-bit int to 32-bit int
82  input32 = vmovl_s16(input16);
83  // convert 32-bit int to float with scale
84  input_float = vcvtq_f32_s32(input32);
85 
86  prod_re = vmulq_f32(input_float, tapsVal.val[0]);
87  prod_im = vmulq_f32(input_float, tapsVal.val[1]);
88 
89  accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
90  accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);
91 
92  tapsPtr += 4;
93  inputPtr += 4;
94  }
95  vst2q_f32((float*)accumulator_vec, accumulator_val);
96  accumulator_vec[0] += accumulator_vec[1];
97  accumulator_vec[2] += accumulator_vec[3];
98  accumulator_vec[0] += accumulator_vec[2];
99 
100  for(ii = quarter_points * 4; ii < num_points; ++ii) {
101  accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
102  }
103 
104  *result = accumulator_vec[0];
105 }
106 
107 #endif /*LV_HAVE_NEON*/
108 
109 #if LV_HAVE_SSE && LV_HAVE_MMX
110 
111 static inline void volk_16i_32fc_dot_prod_32fc_u_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
112 
113  unsigned int number = 0;
114  const unsigned int sixteenthPoints = num_points / 8;
115 
116  float res[2];
117  float *realpt = &res[0], *imagpt = &res[1];
118  const short* aPtr = input;
119  const float* bPtr = (float*)taps;
120 
121  __m64 m0, m1;
122  __m128 f0, f1, f2, f3;
123  __m128 a0Val, a1Val, a2Val, a3Val;
124  __m128 b0Val, b1Val, b2Val, b3Val;
125  __m128 c0Val, c1Val, c2Val, c3Val;
126 
127  __m128 dotProdVal0 = _mm_setzero_ps();
128  __m128 dotProdVal1 = _mm_setzero_ps();
129  __m128 dotProdVal2 = _mm_setzero_ps();
130  __m128 dotProdVal3 = _mm_setzero_ps();
131 
132  for(;number < sixteenthPoints; number++){
133 
134  m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
135  m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
136  f0 = _mm_cvtpi16_ps(m0);
137  f1 = _mm_cvtpi16_ps(m0);
138  f2 = _mm_cvtpi16_ps(m1);
139  f3 = _mm_cvtpi16_ps(m1);
140 
141  a0Val = _mm_unpacklo_ps(f0, f1);
142  a1Val = _mm_unpackhi_ps(f0, f1);
143  a2Val = _mm_unpacklo_ps(f2, f3);
144  a3Val = _mm_unpackhi_ps(f2, f3);
145 
146  b0Val = _mm_loadu_ps(bPtr);
147  b1Val = _mm_loadu_ps(bPtr+4);
148  b2Val = _mm_loadu_ps(bPtr+8);
149  b3Val = _mm_loadu_ps(bPtr+12);
150 
151  c0Val = _mm_mul_ps(a0Val, b0Val);
152  c1Val = _mm_mul_ps(a1Val, b1Val);
153  c2Val = _mm_mul_ps(a2Val, b2Val);
154  c3Val = _mm_mul_ps(a3Val, b3Val);
155 
156  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
157  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
158  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
159  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
160 
161  aPtr += 8;
162  bPtr += 16;
163  }
164 
165  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
166  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
167  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
168 
169  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
170 
171  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
172 
173  *realpt = dotProductVector[0];
174  *imagpt = dotProductVector[1];
175  *realpt += dotProductVector[2];
176  *imagpt += dotProductVector[3];
177 
178  number = sixteenthPoints*8;
179  for(;number < num_points; number++){
180  *realpt += ((*aPtr) * (*bPtr++));
181  *imagpt += ((*aPtr++) * (*bPtr++));
182  }
183 
184  *result = *(lv_32fc_t*)(&res[0]);
185 }
186 
187 #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
188 
189 
190 
191 
192 #if LV_HAVE_SSE && LV_HAVE_MMX
193 
194 
195 static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
196 
197  unsigned int number = 0;
198  const unsigned int sixteenthPoints = num_points / 8;
199 
200  float res[2];
201  float *realpt = &res[0], *imagpt = &res[1];
202  const short* aPtr = input;
203  const float* bPtr = (float*)taps;
204 
205  __m64 m0, m1;
206  __m128 f0, f1, f2, f3;
207  __m128 a0Val, a1Val, a2Val, a3Val;
208  __m128 b0Val, b1Val, b2Val, b3Val;
209  __m128 c0Val, c1Val, c2Val, c3Val;
210 
211  __m128 dotProdVal0 = _mm_setzero_ps();
212  __m128 dotProdVal1 = _mm_setzero_ps();
213  __m128 dotProdVal2 = _mm_setzero_ps();
214  __m128 dotProdVal3 = _mm_setzero_ps();
215 
216  for(;number < sixteenthPoints; number++){
217 
218  m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
219  m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
220  f0 = _mm_cvtpi16_ps(m0);
221  f1 = _mm_cvtpi16_ps(m0);
222  f2 = _mm_cvtpi16_ps(m1);
223  f3 = _mm_cvtpi16_ps(m1);
224 
225  a0Val = _mm_unpacklo_ps(f0, f1);
226  a1Val = _mm_unpackhi_ps(f0, f1);
227  a2Val = _mm_unpacklo_ps(f2, f3);
228  a3Val = _mm_unpackhi_ps(f2, f3);
229 
230  b0Val = _mm_load_ps(bPtr);
231  b1Val = _mm_load_ps(bPtr+4);
232  b2Val = _mm_load_ps(bPtr+8);
233  b3Val = _mm_load_ps(bPtr+12);
234 
235  c0Val = _mm_mul_ps(a0Val, b0Val);
236  c1Val = _mm_mul_ps(a1Val, b1Val);
237  c2Val = _mm_mul_ps(a2Val, b2Val);
238  c3Val = _mm_mul_ps(a3Val, b3Val);
239 
240  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
241  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
242  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
243  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
244 
245  aPtr += 8;
246  bPtr += 16;
247  }
248 
249  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
250  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
251  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
252 
253  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
254 
255  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
256 
257  *realpt = dotProductVector[0];
258  *imagpt = dotProductVector[1];
259  *realpt += dotProductVector[2];
260  *imagpt += dotProductVector[3];
261 
262  number = sixteenthPoints*8;
263  for(;number < num_points; number++){
264  *realpt += ((*aPtr) * (*bPtr++));
265  *imagpt += ((*aPtr++) * (*bPtr++));
266  }
267 
268  *result = *(lv_32fc_t*)(&res[0]);
269 }
270 
271 #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
272 
273 
274 #endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9
float complex lv_32fc_t
Definition: volk_complex.h:56