GNU Radio Manual and C++ API Reference  3.7.6.1
The Free & Open Software Radio Ecosystem
volk_32fc_x2_conjugate_dot_prod_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
24 #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
25 
26 
27 #include<volk/volk_complex.h>
28 
29 
30 #ifdef LV_HAVE_GENERIC
31 
32 
33 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
34 
35  const unsigned int num_bytes = num_points*8;
36 
37  float * res = (float*) result;
38  float * in = (float*) input;
39  float * tp = (float*) taps;
40  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
41  unsigned int isodd = (num_bytes >> 3) &1;
42 
43 
44 
45  float sum0[2] = {0,0};
46  float sum1[2] = {0,0};
47  unsigned int i = 0;
48 
49 
50  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
51 
52  sum0[0] += in[0] * tp[0] + in[1] * tp[1];
53  sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
54  sum1[0] += in[2] * tp[2] + in[3] * tp[3];
55  sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
56 
57 
58  in += 4;
59  tp += 4;
60 
61  }
62 
63 
64  res[0] = sum0[0] + sum1[0];
65  res[1] = sum0[1] + sum1[1];
66 
67 
68 
69  for(i = 0; i < isodd; ++i) {
70 
71 
72  *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
73 
74  }
75  /*
76  for(i = 0; i < num_bytes >> 3; ++i) {
77  *result += input[i] * conjf(taps[i]);
78  }
79  */
80 }
81 
82 #endif /*LV_HAVE_GENERIC*/
83 
84 #ifdef LV_HAVE_SSE3
85 
86 #include <xmmintrin.h>
87 #include <pmmintrin.h>
88 #include <mmintrin.h>
89 
90 
91 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
92 
93  unsigned int num_bytes = num_points*8;
94 
95  // Variable never used?
96  //__VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
97 
98  union HalfMask {
99  uint32_t intRep[4];
100  __m128 vec;
101  } halfMask;
102 
103  union NegMask {
104  int intRep[4];
105  __m128 vec;
106  } negMask;
107 
108  unsigned int offset = 0;
109  float Rsum=0, Isum=0;
110  float Im,Re;
111 
112  __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is;
113  __m128 zv = {0,0,0,0};
114 
115  halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF;
116  halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000;
117 
118  negMask.intRep[0] = negMask.intRep[2] = 0x80000000;
119  negMask.intRep[1] = negMask.intRep[3] = 0;
120 
121  // main loop
122  while(num_bytes >= 4*sizeof(float)){
123 
124  in1 = _mm_loadu_ps( (float*) (input+offset) );
125  in2 = _mm_loadu_ps( (float*) (taps+offset) );
126  Rv = _mm_mul_ps(in1, in2);
127  fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
128  Iv = _mm_mul_ps(in1, fehg);
129  Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv);
130  Ivm = _mm_xor_ps( negMask.vec, Iv );
131  Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv);
132  _mm_store_ss( &Im, Is );
133  _mm_store_ss( &Re, Rs );
134  num_bytes -= 4*sizeof(float);
135  offset += 2;
136  Rsum += Re;
137  Isum += Im;
138  }
139 
140  // handle the last complex case ...
141  if(num_bytes > 0){
142 
143  if(num_bytes != 4){
144  // bad things are happening
145  }
146 
147  in1 = _mm_loadu_ps( (float*) (input+offset) );
148  in2 = _mm_loadu_ps( (float*) (taps+offset) );
149  Rv = _mm_and_ps(_mm_mul_ps(in1, in2), halfMask.vec);
150  fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
151  Iv = _mm_and_ps(_mm_mul_ps(in1, fehg), halfMask.vec);
152  Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv);
153  Ivm = _mm_xor_ps( negMask.vec, Iv );
154  Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv);
155  _mm_store_ss( &Im, Is );
156  _mm_store_ss( &Re, Rs );
157  Rsum += Re;
158  Isum += Im;
159  }
160 
161  result[0] = lv_cmake(Rsum,Isum);
162  return;
163 }
164 
165 #endif /*LV_HAVE_SSE3*/
166 
167 #ifdef LV_HAVE_NEON
168 #include <arm_neon.h>
169 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
170 
171  unsigned int quarter_points = num_points / 4;
172  unsigned int number;
173 
174  lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
175  lv_32fc_t* b_ptr = (lv_32fc_t*) input;
176  // for 2-lane vectors, 1st lane holds the real part,
177  // 2nd lane holds the imaginary part
178  float32x4x2_t a_val, b_val, accumulator;
179  float32x4x2_t tmp_imag;
180  accumulator.val[0] = vdupq_n_f32(0);
181  accumulator.val[1] = vdupq_n_f32(0);
182 
183  for(number = 0; number < quarter_points; ++number) {
184  a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
185  b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
186  __builtin_prefetch(a_ptr+8);
187  __builtin_prefetch(b_ptr+8);
188 
189  // do the first multiply
190  tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
191  tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
192 
193  // use multiply accumulate/subtract to get result
194  tmp_imag.val[1] = vmlsq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
195  tmp_imag.val[0] = vmlaq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
196 
197  accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]);
198  accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]);
199 
200  // increment pointers
201  a_ptr += 4;
202  b_ptr += 4;
203  }
204  lv_32fc_t accum_result[4];
205  vst2q_f32((float*)accum_result, accumulator);
206  *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
207 
208  // tail case
209  for(number = quarter_points*4; number < num_points; ++number) {
210  *result += (*a_ptr++) * lv_conj(*b_ptr++);
211  }
212  *result = lv_conj(*result);
213 
214 }
215 #endif /*LV_HAVE_NEON*/
216 
217 #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/
218 
219 #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
220 #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
221 
222 #include <volk/volk_common.h>
223 #include<volk/volk_complex.h>
224 #include<stdio.h>
225 
226 
227 #ifdef LV_HAVE_GENERIC
228 
229 
230 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
231 
232  const unsigned int num_bytes = num_points*8;
233 
234  float * res = (float*) result;
235  float * in = (float*) input;
236  float * tp = (float*) taps;
237  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
238  unsigned int isodd = (num_bytes >> 3) &1;
239 
240 
241 
242  float sum0[2] = {0,0};
243  float sum1[2] = {0,0};
244  unsigned int i = 0;
245 
246 
247  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
248 
249 
250  sum0[0] += in[0] * tp[0] + in[1] * tp[1];
251  sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
252  sum1[0] += in[2] * tp[2] + in[3] * tp[3];
253  sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
254 
255 
256  in += 4;
257  tp += 4;
258 
259  }
260 
261 
262  res[0] = sum0[0] + sum1[0];
263  res[1] = sum0[1] + sum1[1];
264 
265 
266 
267  for(i = 0; i < isodd; ++i) {
268 
269 
270  *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
271 
272  }
273  /*
274  for(i = 0; i < num_bytes >> 3; ++i) {
275  *result += input[i] * conjf(taps[i]);
276  }
277  */
278 }
279 
280 #endif /*LV_HAVE_GENERIC*/
281 
282 
283 #if LV_HAVE_SSE && LV_HAVE_64
284 
285 
286 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
287 
288  const unsigned int num_bytes = num_points*8;
289 
290  __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
291 
292 
293 
294 
295  asm volatile
296  (
297  "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
298  "# const float *taps, unsigned num_bytes)\n\t"
299  "# float sum0 = 0;\n\t"
300  "# float sum1 = 0;\n\t"
301  "# float sum2 = 0;\n\t"
302  "# float sum3 = 0;\n\t"
303  "# do {\n\t"
304  "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
305  "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
306  "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
307  "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
308  "# input += 4;\n\t"
309  "# taps += 4; \n\t"
310  "# } while (--n_2_ccomplex_blocks != 0);\n\t"
311  "# result[0] = sum0 + sum2;\n\t"
312  "# result[1] = sum1 + sum3;\n\t"
313  "# TODO: prefetch and better scheduling\n\t"
314  " xor %%r9, %%r9\n\t"
315  " xor %%r10, %%r10\n\t"
316  " movq %[conjugator], %%r9\n\t"
317  " movq %%rcx, %%rax\n\t"
318  " movaps 0(%%r9), %%xmm8\n\t"
319  " movq %%rcx, %%r8\n\t"
320  " movq %[rsi], %%r9\n\t"
321  " movq %[rdx], %%r10\n\t"
322  " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
323  " movaps 0(%%r9), %%xmm0\n\t"
324  " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
325  " movups 0(%%r10), %%xmm2\n\t"
326  " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
327  " shr $4, %%r8\n\t"
328  " xorps %%xmm8, %%xmm2\n\t"
329  " jmp .%=L1_test\n\t"
330  " # 4 taps / loop\n\t"
331  " # something like ?? cycles / loop\n\t"
332  ".%=Loop1: \n\t"
333  "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
334  "# movaps (%%r9), %%xmmA\n\t"
335  "# movaps (%%r10), %%xmmB\n\t"
336  "# movaps %%xmmA, %%xmmZ\n\t"
337  "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
338  "# mulps %%xmmB, %%xmmA\n\t"
339  "# mulps %%xmmZ, %%xmmB\n\t"
340  "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
341  "# xorps %%xmmPN, %%xmmA\n\t"
342  "# movaps %%xmmA, %%xmmZ\n\t"
343  "# unpcklps %%xmmB, %%xmmA\n\t"
344  "# unpckhps %%xmmB, %%xmmZ\n\t"
345  "# movaps %%xmmZ, %%xmmY\n\t"
346  "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
347  "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
348  "# addps %%xmmZ, %%xmmA\n\t"
349  "# addps %%xmmA, %%xmmC\n\t"
350  "# A=xmm0, B=xmm2, Z=xmm4\n\t"
351  "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
352  " movaps 16(%%r9), %%xmm1\n\t"
353  " movaps %%xmm0, %%xmm4\n\t"
354  " mulps %%xmm2, %%xmm0\n\t"
355  " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
356  " movaps 16(%%r10), %%xmm3\n\t"
357  " movaps %%xmm1, %%xmm5\n\t"
358  " xorps %%xmm8, %%xmm3\n\t"
359  " addps %%xmm0, %%xmm6\n\t"
360  " mulps %%xmm3, %%xmm1\n\t"
361  " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
362  " addps %%xmm1, %%xmm6\n\t"
363  " mulps %%xmm4, %%xmm2\n\t"
364  " movaps 32(%%r9), %%xmm0\n\t"
365  " addps %%xmm2, %%xmm7\n\t"
366  " mulps %%xmm5, %%xmm3\n\t"
367  " add $32, %%r9\n\t"
368  " movaps 32(%%r10), %%xmm2\n\t"
369  " addps %%xmm3, %%xmm7\n\t"
370  " add $32, %%r10\n\t"
371  " xorps %%xmm8, %%xmm2\n\t"
372  ".%=L1_test:\n\t"
373  " dec %%rax\n\t"
374  " jge .%=Loop1\n\t"
375  " # We've handled the bulk of multiplies up to here.\n\t"
376  " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
377  " # If so, we've got 2 more taps to do.\n\t"
378  " and $1, %%r8\n\t"
379  " je .%=Leven\n\t"
380  " # The count was odd, do 2 more taps.\n\t"
381  " # Note that we've already got mm0/mm2 preloaded\n\t"
382  " # from the main loop.\n\t"
383  " movaps %%xmm0, %%xmm4\n\t"
384  " mulps %%xmm2, %%xmm0\n\t"
385  " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
386  " addps %%xmm0, %%xmm6\n\t"
387  " mulps %%xmm4, %%xmm2\n\t"
388  " addps %%xmm2, %%xmm7\n\t"
389  ".%=Leven:\n\t"
390  " # neg inversor\n\t"
391  " xorps %%xmm1, %%xmm1\n\t"
392  " mov $0x80000000, %%r9\n\t"
393  " movd %%r9, %%xmm1\n\t"
394  " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
395  " # pfpnacc\n\t"
396  " xorps %%xmm1, %%xmm6\n\t"
397  " movaps %%xmm6, %%xmm2\n\t"
398  " unpcklps %%xmm7, %%xmm6\n\t"
399  " unpckhps %%xmm7, %%xmm2\n\t"
400  " movaps %%xmm2, %%xmm3\n\t"
401  " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
402  " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
403  " addps %%xmm2, %%xmm6\n\t"
404  " # xmm6 = r1 i2 r3 i4\n\t"
405  " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
406  " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
407  " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
408  :
409  :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator)
410  :"rax", "r8", "r9", "r10"
411  );
412 
413 
414  int getem = num_bytes % 16;
415 
416 
417  for(; getem > 0; getem -= 8) {
418 
419 
420  *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
421 
422  }
423 
424  return;
425 }
426 #endif
427 
428 #if LV_HAVE_SSE && LV_HAVE_32
429 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
430 
431  const unsigned int num_bytes = num_points*8;
432 
433  __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
434 
435  int bound = num_bytes >> 4;
436  int leftovers = num_bytes % 16;
437 
438 
439  asm volatile
440  (
441  " #pushl %%ebp\n\t"
442  " #movl %%esp, %%ebp\n\t"
443  " #movl 12(%%ebp), %%eax # input\n\t"
444  " #movl 16(%%ebp), %%edx # taps\n\t"
445  " #movl 20(%%ebp), %%ecx # n_bytes\n\t"
446  " movaps 0(%[conjugator]), %%xmm1\n\t"
447  " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
448  " movaps 0(%[eax]), %%xmm0\n\t"
449  " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
450  " movaps 0(%[edx]), %%xmm2\n\t"
451  " movl %[ecx], (%[out])\n\t"
452  " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t"
453 
454  " xorps %%xmm1, %%xmm2\n\t"
455  " jmp .%=L1_test\n\t"
456  " # 4 taps / loop\n\t"
457  " # something like ?? cycles / loop\n\t"
458  ".%=Loop1: \n\t"
459  "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
460  "# movaps (%[eax]), %%xmmA\n\t"
461  "# movaps (%[edx]), %%xmmB\n\t"
462  "# movaps %%xmmA, %%xmmZ\n\t"
463  "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
464  "# mulps %%xmmB, %%xmmA\n\t"
465  "# mulps %%xmmZ, %%xmmB\n\t"
466  "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
467  "# xorps %%xmmPN, %%xmmA\n\t"
468  "# movaps %%xmmA, %%xmmZ\n\t"
469  "# unpcklps %%xmmB, %%xmmA\n\t"
470  "# unpckhps %%xmmB, %%xmmZ\n\t"
471  "# movaps %%xmmZ, %%xmmY\n\t"
472  "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
473  "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
474  "# addps %%xmmZ, %%xmmA\n\t"
475  "# addps %%xmmA, %%xmmC\n\t"
476  "# A=xmm0, B=xmm2, Z=xmm4\n\t"
477  "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
478  " movaps 16(%[edx]), %%xmm3\n\t"
479  " movaps %%xmm0, %%xmm4\n\t"
480  " xorps %%xmm1, %%xmm3\n\t"
481  " mulps %%xmm2, %%xmm0\n\t"
482  " movaps 16(%[eax]), %%xmm1\n\t"
483  " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
484  " movaps %%xmm1, %%xmm5\n\t"
485  " addps %%xmm0, %%xmm6\n\t"
486  " mulps %%xmm3, %%xmm1\n\t"
487  " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
488  " addps %%xmm1, %%xmm6\n\t"
489  " movaps 0(%[conjugator]), %%xmm1\n\t"
490  " mulps %%xmm4, %%xmm2\n\t"
491  " movaps 32(%[eax]), %%xmm0\n\t"
492  " addps %%xmm2, %%xmm7\n\t"
493  " mulps %%xmm5, %%xmm3\n\t"
494  " addl $32, %[eax]\n\t"
495  " movaps 32(%[edx]), %%xmm2\n\t"
496  " addps %%xmm3, %%xmm7\n\t"
497  " xorps %%xmm1, %%xmm2\n\t"
498  " addl $32, %[edx]\n\t"
499  ".%=L1_test:\n\t"
500  " decl %[ecx]\n\t"
501  " jge .%=Loop1\n\t"
502  " # We've handled the bulk of multiplies up to here.\n\t"
503  " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
504  " # If so, we've got 2 more taps to do.\n\t"
505  " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t"
506  " shrl $4, %[ecx]\n\t"
507  " andl $1, %[ecx]\n\t"
508  " je .%=Leven\n\t"
509  " # The count was odd, do 2 more taps.\n\t"
510  " # Note that we've already got mm0/mm2 preloaded\n\t"
511  " # from the main loop.\n\t"
512  " movaps %%xmm0, %%xmm4\n\t"
513  " mulps %%xmm2, %%xmm0\n\t"
514  " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
515  " addps %%xmm0, %%xmm6\n\t"
516  " mulps %%xmm4, %%xmm2\n\t"
517  " addps %%xmm2, %%xmm7\n\t"
518  ".%=Leven:\n\t"
519  " # neg inversor\n\t"
520  " #movl 8(%%ebp), %[eax] \n\t"
521  " xorps %%xmm1, %%xmm1\n\t"
522  " movl $0x80000000, (%[out])\n\t"
523  " movss (%[out]), %%xmm1\n\t"
524  " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
525  " # pfpnacc\n\t"
526  " xorps %%xmm1, %%xmm6\n\t"
527  " movaps %%xmm6, %%xmm2\n\t"
528  " unpcklps %%xmm7, %%xmm6\n\t"
529  " unpckhps %%xmm7, %%xmm2\n\t"
530  " movaps %%xmm2, %%xmm3\n\t"
531  " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
532  " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
533  " addps %%xmm2, %%xmm6\n\t"
534  " # xmm6 = r1 i2 r3 i4\n\t"
535  " #movl 8(%%ebp), %[eax] # @result\n\t"
536  " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
537  " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
538  " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t"
539  " #popl %%ebp\n\t"
540  :
541  : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator)
542  );
543 
544 
545 
546 
547  printf("%d, %d\n", leftovers, bound);
548 
549  for(; leftovers > 0; leftovers -= 8) {
550 
551 
552  *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
553 
554  }
555 
556  return;
557 
558 
559 
560 
561 
562 
563 }
564 
565 #endif /*LV_HAVE_SSE && LV_HAVE_32*/
566 
567 
568 
569 #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H*/
#define lv_conj(x)
Definition: volk_complex.h:80
unsigned int uint32_t
Definition: stdint.h:80
#define lv_cmake(r, i)
Definition: volk_complex.h:59
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9
float complex lv_32fc_t
Definition: volk_complex.h:56
uint32_t i[4]
Definition: volk_common.h:80