GNU Radio Manual and C++ API Reference  3.7.6.1
The Free & Open Software Radio Ecosystem
volk_32fc_x2_square_dist_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
24 #define INCLUDED_volk_32fc_x2_square_dist_32f_a_H
25 
26 #include<inttypes.h>
27 #include<stdio.h>
28 #include<volk/volk_complex.h>
29 
30 #ifdef LV_HAVE_SSE3
31 #include<xmmintrin.h>
32 #include<pmmintrin.h>
33 
34 static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) {
35 
36  const unsigned int num_bytes = num_points*8;
37 
38  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
39 
40  lv_32fc_t diff;
41  float sq_dist;
42  int bound = num_bytes >> 5;
43  int leftovers0 = (num_bytes >> 4) & 1;
44  int leftovers1 = (num_bytes >> 3) & 1;
45  int i = 0;
46 
47  xmm1 = _mm_setzero_ps();
48  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
49  xmm2 = _mm_load_ps((float*)&points[0]);
50  xmm1 = _mm_movelh_ps(xmm1, xmm1);
51  xmm3 = _mm_load_ps((float*)&points[2]);
52 
53 
54  for(; i < bound - 1; ++i) {
55  xmm4 = _mm_sub_ps(xmm1, xmm2);
56  xmm5 = _mm_sub_ps(xmm1, xmm3);
57  points += 4;
58  xmm6 = _mm_mul_ps(xmm4, xmm4);
59  xmm7 = _mm_mul_ps(xmm5, xmm5);
60 
61  xmm2 = _mm_load_ps((float*)&points[0]);
62 
63  xmm4 = _mm_hadd_ps(xmm6, xmm7);
64 
65  xmm3 = _mm_load_ps((float*)&points[2]);
66 
67  _mm_store_ps(target, xmm4);
68 
69  target += 4;
70 
71  }
72 
73  xmm4 = _mm_sub_ps(xmm1, xmm2);
74  xmm5 = _mm_sub_ps(xmm1, xmm3);
75 
76 
77 
78  points += 4;
79  xmm6 = _mm_mul_ps(xmm4, xmm4);
80  xmm7 = _mm_mul_ps(xmm5, xmm5);
81 
82  xmm4 = _mm_hadd_ps(xmm6, xmm7);
83 
84  _mm_store_ps(target, xmm4);
85 
86  target += 4;
87 
88  for(i = 0; i < leftovers0; ++i) {
89 
90  xmm2 = _mm_load_ps((float*)&points[0]);
91 
92  xmm4 = _mm_sub_ps(xmm1, xmm2);
93 
94  points += 2;
95 
96  xmm6 = _mm_mul_ps(xmm4, xmm4);
97 
98  xmm4 = _mm_hadd_ps(xmm6, xmm6);
99 
100  _mm_storeh_pi((__m64*)target, xmm4);
101 
102  target += 2;
103  }
104 
105  for(i = 0; i < leftovers1; ++i) {
106 
107  diff = src0[0] - points[0];
108 
109  sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
110 
111  target[0] = sq_dist;
112  }
113 }
114 
115 #endif /*LV_HAVE_SSE3*/
116 
117 #ifdef LV_HAVE_NEON
118 #include <arm_neon.h>
119 static inline void volk_32fc_x2_square_dist_32f_neon(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) {
120  const unsigned int quarter_points = num_points / 4;
121  unsigned int number;
122 
123  float32x4x2_t a_vec, b_vec;
124  float32x4x2_t diff_vec;
125  float32x4_t tmp, tmp1, dist_sq;
126  a_vec.val[0] = vdupq_n_f32( lv_creal(src0[0]) );
127  a_vec.val[1] = vdupq_n_f32( lv_cimag(src0[0]) );
128  for(number=0; number < quarter_points; ++number) {
129  b_vec = vld2q_f32((float*)points);
130  diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]);
131  diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]);
132  tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]);
133  tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]);
134 
135  dist_sq = vaddq_f32(tmp, tmp1);
136  vst1q_f32(target, dist_sq);
137  points += 4;
138  target += 4;
139  }
140  for(number=quarter_points*4; number < num_points; ++number) {
141  lv_32fc_t diff = src0[0] - *points++;
142  *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
143  }
144 }
145 #endif /* LV_HAVE_NEON */
146 
147 #ifdef LV_HAVE_GENERIC
148 static inline void volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) {
149 
150  const unsigned int num_bytes = num_points*8;
151 
152  lv_32fc_t diff;
153  float sq_dist;
154  unsigned int i = 0;
155 
156  for(; i < num_bytes >> 3; ++i) {
157  diff = src0[0] - points[i];
158 
159  sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
160 
161  target[i] = sq_dist;
162  }
163 }
164 
165 #endif /*LV_HAVE_GENERIC*/
166 
167 
168 #endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/
float complex lv_32fc_t
Definition: volk_complex.h:56
#define lv_creal(x)
Definition: volk_complex.h:76
#define lv_cimag(x)
Definition: volk_complex.h:78