GNU Radio Manual and C++ API Reference  3.7.6.1
The Free & Open Software Radio Ecosystem
volk_32fc_x2_multiply_conjugate_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
24 #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
25 
26 #include <inttypes.h>
27 #include <stdio.h>
28 #include <volk/volk_complex.h>
29 #include <float.h>
30 
31 #ifdef LV_HAVE_AVX
32 #include <immintrin.h>
33  /*!
34  \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
35  \param cVector The vector where the results will be stored
36  \param aVector First vector to be multiplied
37  \param bVector Second vector that is conjugated before being multiplied
38  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
39  */
40 static inline void volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
41  unsigned int number = 0;
42  const unsigned int quarterPoints = num_points / 4;
43 
44  __m256 x, y, yl, yh, z, tmp1, tmp2;
45  lv_32fc_t* c = cVector;
46  const lv_32fc_t* a = aVector;
47  const lv_32fc_t* b = bVector;
48 
49  __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
50 
51  for(;number < quarterPoints; number++){
52 
53  x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
54  y = _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
55 
56  y = _mm256_xor_ps(y, conjugator); // conjugate y
57 
58  yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
59  yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
60 
61  tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
62 
63  x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ...
64 
65  tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
66 
67  z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ...
68 
69  _mm256_storeu_ps((float*)c,z); // Store the results back into the C container
70 
71  a += 4;
72  b += 4;
73  c += 4;
74  }
75 
76  number = quarterPoints * 4;
77 
78  for(; number < num_points; number++) {
79  *c++ = (*a++) * lv_conj(*b++);
80  }
81 }
82 #endif /* LV_HAVE_AVX */
83 
84 #ifdef LV_HAVE_SSE3
85 #include <pmmintrin.h>
86  /*!
87  \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
88  \param cVector The vector where the results will be stored
89  \param aVector First vector to be multiplied
90  \param bVector Second vector that is conjugated before being multiplied
91  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
92  */
93 static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
94  unsigned int number = 0;
95  const unsigned int halfPoints = num_points / 2;
96 
97  __m128 x, y, yl, yh, z, tmp1, tmp2;
98  lv_32fc_t* c = cVector;
99  const lv_32fc_t* a = aVector;
100  const lv_32fc_t* b = bVector;
101 
102  __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
103 
104  for(;number < halfPoints; number++){
105 
106  x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
107  y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
108 
109  y = _mm_xor_ps(y, conjugator); // conjugate y
110 
111  yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
112  yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
113 
114  tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
115 
116  x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
117 
118  tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
119 
120  z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
121 
122  _mm_storeu_ps((float*)c,z); // Store the results back into the C container
123 
124  a += 2;
125  b += 2;
126  c += 2;
127  }
128 
129  if((num_points % 2) != 0) {
130  *c = (*a) * lv_conj(*b);
131  }
132 }
133 #endif /* LV_HAVE_SSE3 */
134 
135 #ifdef LV_HAVE_GENERIC
136  /*!
137  \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
138  \param cVector The vector where the results will be stored
139  \param aVector First vector to be multiplied
140  \param bVector Second vector that is conjugated before being multiplied
141  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
142  */
143 static inline void volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
144  lv_32fc_t* cPtr = cVector;
145  const lv_32fc_t* aPtr = aVector;
146  const lv_32fc_t* bPtr= bVector;
147  unsigned int number = 0;
148 
149  for(number = 0; number < num_points; number++){
150  *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
151  }
152 }
153 #endif /* LV_HAVE_GENERIC */
154 
155 
156 #endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */
157 #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
158 #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
159 
160 #include <inttypes.h>
161 #include <stdio.h>
162 #include <volk/volk_complex.h>
163 #include <float.h>
164 
165 #ifdef LV_HAVE_AVX
166 #include <immintrin.h>
167  /*!
168  \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
169  \param cVector The vector where the results will be stored
170  \param aVector First vector to be multiplied
171  \param bVector Second vector that is conjugated before being multiplied
172  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
173  */
174 static inline void volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
175  unsigned int number = 0;
176  const unsigned int quarterPoints = num_points / 4;
177 
178  __m256 x, y, yl, yh, z, tmp1, tmp2;
179  lv_32fc_t* c = cVector;
180  const lv_32fc_t* a = aVector;
181  const lv_32fc_t* b = bVector;
182 
183  __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
184 
185  for(;number < quarterPoints; number++){
186 
187  x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
188  y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
189 
190  y = _mm256_xor_ps(y, conjugator); // conjugate y
191 
192  yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
193  yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
194 
195  tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
196 
197  x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ...
198 
199  tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
200 
201  z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ...
202 
203  _mm256_store_ps((float*)c,z); // Store the results back into the C container
204 
205  a += 4;
206  b += 4;
207  c += 4;
208  }
209 
210  number = quarterPoints * 4;
211 
212  for(; number < num_points; number++) {
213  *c++ = (*a++) * lv_conj(*b++);
214  }
215 }
216 #endif /* LV_HAVE_AVX */
217 
218 #ifdef LV_HAVE_SSE3
219 #include <pmmintrin.h>
220  /*!
221  \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
222  \param cVector The vector where the results will be stored
223  \param aVector First vector to be multiplied
224  \param bVector Second vector that is conjugated before being multiplied
225  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
226  */
227 static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
228  unsigned int number = 0;
229  const unsigned int halfPoints = num_points / 2;
230 
231  __m128 x, y, yl, yh, z, tmp1, tmp2;
232  lv_32fc_t* c = cVector;
233  const lv_32fc_t* a = aVector;
234  const lv_32fc_t* b = bVector;
235 
236  __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
237 
238  for(;number < halfPoints; number++){
239 
240  x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
241  y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
242 
243  y = _mm_xor_ps(y, conjugator); // conjugate y
244 
245  yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
246  yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
247 
248  tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
249 
250  x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
251 
252  tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
253 
254  z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
255 
256  _mm_store_ps((float*)c,z); // Store the results back into the C container
257 
258  a += 2;
259  b += 2;
260  c += 2;
261  }
262 
263  if((num_points % 2) != 0) {
264  *c = (*a) * lv_conj(*b);
265  }
266 }
267 #endif /* LV_HAVE_SSE3 */
268 
269 #ifdef LV_HAVE_NEON
270 #include <arm_neon.h>
271  /*!
272  \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
273  \param cVector The vector where the results will be stored
274  \param aVector First vector to be multiplied
275  \param bVector Second vector that is conjugated before being multiplied
276  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
277  */
278 static inline void volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
279 
280  lv_32fc_t *a_ptr = (lv_32fc_t*) aVector;
281  lv_32fc_t *b_ptr = (lv_32fc_t*) bVector;
282  unsigned int quarter_points = num_points / 4;
283  float32x4x2_t a_val, b_val, c_val;
284  float32x4x2_t tmp_real, tmp_imag;
285  unsigned int number = 0;
286 
287  for(number = 0; number < quarter_points; ++number) {
288  a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
289  b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
290  b_val.val[1] = vnegq_f32(b_val.val[1]);
291  __builtin_prefetch(a_ptr+4);
292  __builtin_prefetch(b_ptr+4);
293 
294  // multiply the real*real and imag*imag to get real result
295  // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
296  tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
297  // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
298  tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
299 
300  // Multiply cross terms to get the imaginary result
301  // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
302  tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
303  // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
304  tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
305 
306  // store the results
307  c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
308  c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
309  vst2q_f32((float*)cVector, c_val);
310 
311  a_ptr += 4;
312  b_ptr += 4;
313  cVector += 4;
314  }
315 
316  for(number = quarter_points*4; number < num_points; number++){
317  *cVector++ = (*a_ptr++) * conj(*b_ptr++);
318  }
319 }
320 #endif /* LV_HAVE_NEON */
321 
322 #ifdef LV_HAVE_GENERIC
323  /*!
324  \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
325  \param cVector The vector where the results will be stored
326  \param aVector First vector to be multiplied
327  \param bVector Second vector that is conjugated before being multiplied
328  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
329  */
330 static inline void volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
331  lv_32fc_t* cPtr = cVector;
332  const lv_32fc_t* aPtr = aVector;
333  const lv_32fc_t* bPtr= bVector;
334  unsigned int number = 0;
335 
336  for(number = 0; number < num_points; number++){
337  *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
338  }
339 }
340 #endif /* LV_HAVE_GENERIC */
341 
342 
343 #endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */
#define lv_conj(x)
Definition: volk_complex.h:80
float complex lv_32fc_t
Definition: volk_complex.h:56