GNU Radio Manual and C++ API Reference  3.7.6.1
The Free & Open Software Radio Ecosystem
volk_32fc_x2_multiply_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
24 #define INCLUDED_volk_32fc_x2_multiply_32fc_u_H
25 
26 #include <inttypes.h>
27 #include <stdio.h>
28 #include <volk/volk_complex.h>
29 #include <float.h>
30 
31 #ifdef LV_HAVE_AVX
32 #include <immintrin.h>
33  /*!
34  \brief Multiplies the two input complex vectors and stores their results in the third vector
35  \param cVector The vector where the results will be stored
36  \param aVector One of the vectors to be multiplied
37  \param bVector One of the vectors to be multiplied
38  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
39  */
40 static inline void volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
41  unsigned int number = 0;
42  const unsigned int quarterPoints = num_points / 4;
43 
44  __m256 x, y, yl, yh, z, tmp1, tmp2;
45  lv_32fc_t* c = cVector;
46  const lv_32fc_t* a = aVector;
47  const lv_32fc_t* b = bVector;
48 
49  for(;number < quarterPoints; number++){
50 
51  x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
52  y = _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
53 
54  yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
55  yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
56 
57  tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
58 
59  x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ...
60 
61  tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
62 
63  z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
64 
65  _mm256_storeu_ps((float*)c,z); // Store the results back into the C container
66 
67  a += 4;
68  b += 4;
69  c += 4;
70  }
71 
72  number = quarterPoints * 4;
73 
74  for(; number < num_points; number++) {
75  *c++ = (*a++) * (*b++);
76  }
77 }
78 #endif /* LV_HAVE_AVX */
79 
80 #ifdef LV_HAVE_SSE3
81 #include <pmmintrin.h>
82  /*!
83  \brief Multiplies the two input complex vectors and stores their results in the third vector
84  \param cVector The vector where the results will be stored
85  \param aVector One of the vectors to be multiplied
86  \param bVector One of the vectors to be multiplied
87  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
88  */
89 static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
90  unsigned int number = 0;
91  const unsigned int halfPoints = num_points / 2;
92 
93  __m128 x, y, yl, yh, z, tmp1, tmp2;
94  lv_32fc_t* c = cVector;
95  const lv_32fc_t* a = aVector;
96  const lv_32fc_t* b = bVector;
97 
98  for(;number < halfPoints; number++){
99 
100  x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
101  y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
102 
103  yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
104  yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
105 
106  tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
107 
108  x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
109 
110  tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
111 
112  z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
113 
114  _mm_storeu_ps((float*)c,z); // Store the results back into the C container
115 
116  a += 2;
117  b += 2;
118  c += 2;
119  }
120 
121  if((num_points % 2) != 0) {
122  *c = (*a) * (*b);
123  }
124 }
125 #endif /* LV_HAVE_SSE3 */
126 
127 #ifdef LV_HAVE_GENERIC
128  /*!
129  \brief Multiplies the two input complex vectors and stores their results in the third vector
130  \param cVector The vector where the results will be stored
131  \param aVector One of the vectors to be multiplied
132  \param bVector One of the vectors to be multiplied
133  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
134  */
135 static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
136  lv_32fc_t* cPtr = cVector;
137  const lv_32fc_t* aPtr = aVector;
138  const lv_32fc_t* bPtr= bVector;
139  unsigned int number = 0;
140 
141  for(number = 0; number < num_points; number++){
142  *cPtr++ = (*aPtr++) * (*bPtr++);
143  }
144 }
145 #endif /* LV_HAVE_GENERIC */
146 
147 
148 #endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */
149 #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H
150 #define INCLUDED_volk_32fc_x2_multiply_32fc_a_H
151 
152 #include <inttypes.h>
153 #include <stdio.h>
154 #include <volk/volk_complex.h>
155 #include <float.h>
156 
157 #ifdef LV_HAVE_AVX
158 #include <immintrin.h>
159  /*!
160  \brief Multiplies the two input complex vectors and stores their results in the third vector
161  \param cVector The vector where the results will be stored
162  \param aVector One of the vectors to be multiplied
163  \param bVector One of the vectors to be multiplied
164  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
165  */
166 static inline void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
167  unsigned int number = 0;
168  const unsigned int quarterPoints = num_points / 4;
169 
170  __m256 x, y, yl, yh, z, tmp1, tmp2;
171  lv_32fc_t* c = cVector;
172  const lv_32fc_t* a = aVector;
173  const lv_32fc_t* b = bVector;
174 
175  for(;number < quarterPoints; number++){
176 
177  x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
178  y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
179 
180  yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
181  yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
182 
183  tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
184 
185  x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ...
186 
187  tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
188 
189  z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
190 
191  _mm256_store_ps((float*)c,z); // Store the results back into the C container
192 
193  a += 4;
194  b += 4;
195  c += 4;
196  }
197 
198  number = quarterPoints * 4;
199 
200  for(; number < num_points; number++) {
201  *c++ = (*a++) * (*b++);
202  }
203 }
204 #endif /* LV_HAVE_AVX */
205 
206 #ifdef LV_HAVE_SSE3
207 #include <pmmintrin.h>
208  /*!
209  \brief Multiplies the two input complex vectors and stores their results in the third vector
210  \param cVector The vector where the results will be stored
211  \param aVector One of the vectors to be multiplied
212  \param bVector One of the vectors to be multiplied
213  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
214  */
215 static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
216  unsigned int number = 0;
217  const unsigned int halfPoints = num_points / 2;
218 
219  __m128 x, y, yl, yh, z, tmp1, tmp2;
220  lv_32fc_t* c = cVector;
221  const lv_32fc_t* a = aVector;
222  const lv_32fc_t* b = bVector;
223  for(;number < halfPoints; number++){
224 
225  x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
226  y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
227 
228  yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
229  yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
230 
231  tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
232 
233  x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
234 
235  tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
236 
237  z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
238 
239  _mm_store_ps((float*)c,z); // Store the results back into the C container
240 
241  a += 2;
242  b += 2;
243  c += 2;
244  }
245 
246  if((num_points % 2) != 0) {
247  *c = (*a) * (*b);
248  }
249 }
250 #endif /* LV_HAVE_SSE3 */
251 
252 #ifdef LV_HAVE_GENERIC
253  /*!
254  \brief Multiplies the two input complex vectors and stores their results in the third vector
255  \param cVector The vector where the results will be stored
256  \param aVector One of the vectors to be multiplied
257  \param bVector One of the vectors to be multiplied
258  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
259  */
260 static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
261  lv_32fc_t* cPtr = cVector;
262  const lv_32fc_t* aPtr = aVector;
263  const lv_32fc_t* bPtr= bVector;
264  unsigned int number = 0;
265 
266  for(number = 0; number < num_points; number++){
267  *cPtr++ = (*aPtr++) * (*bPtr++);
268  }
269 }
270 #endif /* LV_HAVE_GENERIC */
271 
272 #ifdef LV_HAVE_NEON
273 #include <arm_neon.h>
274 
275  /*!
276  \brief Multiplies the two input complex vectors and stores their results in the third vector
277  \param cVector The vector where the results will be stored
278  \param aVector One of the vectors to be multiplied
279  \param bVector One of the vectors to be multiplied
280  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
281  */
282 static inline void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
283 
284  lv_32fc_t *a_ptr = (lv_32fc_t*) aVector;
285  lv_32fc_t *b_ptr = (lv_32fc_t*) bVector;
286  unsigned int quarter_points = num_points / 4;
287  float32x4x2_t a_val, b_val, c_val;
288  float32x4x2_t tmp_real, tmp_imag;
289  unsigned int number = 0;
290 
291  for(number = 0; number < quarter_points; ++number) {
292  a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
293  b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
294  __builtin_prefetch(a_ptr+4);
295  __builtin_prefetch(b_ptr+4);
296 
297  // multiply the real*real and imag*imag to get real result
298  // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
299  tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
300  // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
301  tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
302 
303  // Multiply cross terms to get the imaginary result
304  // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
305  tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
306  // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
307  tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
308 
309  // store the results
310  c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
311  c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
312  vst2q_f32((float*)cVector, c_val);
313 
314  a_ptr += 4;
315  b_ptr += 4;
316  cVector += 4;
317  }
318 
319  for(number = quarter_points*4; number < num_points; number++){
320  *cVector++ = (*a_ptr++) * (*b_ptr++);
321  }
322 
323 }
324 #endif /* LV_HAVE_NEON */
325 
326 #ifdef LV_HAVE_NEON
327  /*!
328  \brief Multiplies the two input complex vectors and stores their results in the third vector
329  \param cVector The vector where the results will be stored
330  \param aVector One of the vectors to be multiplied
331  \param bVector One of the vectors to be multiplied
332  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
333  */
334 static inline void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
335 
336  lv_32fc_t *a_ptr = (lv_32fc_t*) aVector;
337  lv_32fc_t *b_ptr = (lv_32fc_t*) bVector;
338  unsigned int quarter_points = num_points / 4;
339  float32x4x2_t a_val, b_val;
340  float32x4x2_t tmp_imag;
341  unsigned int number = 0;
342 
343  for(number = 0; number < quarter_points; ++number) {
344  a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
345  b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
346  __builtin_prefetch(a_ptr+4);
347  __builtin_prefetch(b_ptr+4);
348 
349  // do the first multiply
350  tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
351  tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
352 
353  // use multiply accumulate/subtract to get result
354  tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
355  tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
356 
357  // store
358  vst2q_f32((float*)cVector, tmp_imag);
359  // increment pointers
360  a_ptr += 4;
361  b_ptr += 4;
362  cVector += 4;
363  }
364 
365  for(number = quarter_points*4; number < num_points; number++){
366  *cVector++ = (*a_ptr++) * (*b_ptr++);
367  }
368 
369 }
370 #endif /* LV_HAVE_NEON */
371 
372 #ifdef LV_HAVE_NEON
373  /*!
374  \brief Multiplies the two input complex vectors and stores their results in the third vector. Declaration only: the definition is not in this header (presumably a hand-written NEON assembly routine, judging by the name; confirm against the build).
375  \param cVector The vector where the results will be stored
376  \param aVector One of the vectors to be multiplied
377  \param bVector One of the vectors to be multiplied
378  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
379  */
380 extern void volk_32fc_x2_multiply_32fc_neonasm(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
381 #endif /* LV_HAVE_NEON */
382 
383 #ifdef LV_HAVE_ORC
384  /*!
385  \brief Multiplies the two input complex vectors and stores their results in the third vector. volk_32fc_x2_multiply_32fc_a_orc_impl is only declared here (defined elsewhere); volk_32fc_x2_multiply_32fc_u_orc forwards its arguments to it unchanged.
386  \param cVector The vector where the results will be stored
387  \param aVector One of the vectors to be multiplied
388  \param bVector One of the vectors to be multiplied
389  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
390  */
391 extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
392 static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
393  volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); // thin wrapper: all work happens in the external implementation
394 }
395 #endif /* LV_HAVE_ORC */
396 
397 
398 
399 
400 
401 #endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */
float complex lv_32fc_t
Definition: volk_complex.h:56