GNU Radio Manual and C++ API Reference  3.7.6.1
The Free & Open Software Radio Ecosystem
volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
24 #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
25 
26 #include<inttypes.h>
27 #include<stdio.h>
28 #include<volk/volk_complex.h>
29 #include <string.h>
30 
31 #ifdef LV_HAVE_SSE3
32 #include<xmmintrin.h>
33 #include<pmmintrin.h>
34 
35 static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points) {
36 
37  const unsigned int num_bytes = num_points*8;
38 
39  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
40 
41  lv_32fc_t diff;
42  memset(&diff, 0x0, 2*sizeof(float));
43 
44  float sq_dist = 0.0;
45  int bound = num_bytes >> 5;
46  int leftovers0 = (num_bytes >> 4) & 1;
47  int leftovers1 = (num_bytes >> 3) & 1;
48  int i = 0;
49 
50 
51 
52  xmm1 = _mm_setzero_ps();
53  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
54  xmm2 = _mm_load_ps((float*)&points[0]);
55  xmm8 = _mm_load1_ps(&scalar);
56  xmm1 = _mm_movelh_ps(xmm1, xmm1);
57  xmm3 = _mm_load_ps((float*)&points[2]);
58 
59 
60  for(; i < bound - 1; ++i) {
61 
62  xmm4 = _mm_sub_ps(xmm1, xmm2);
63  xmm5 = _mm_sub_ps(xmm1, xmm3);
64  points += 4;
65  xmm6 = _mm_mul_ps(xmm4, xmm4);
66  xmm7 = _mm_mul_ps(xmm5, xmm5);
67 
68  xmm2 = _mm_load_ps((float*)&points[0]);
69 
70  xmm4 = _mm_hadd_ps(xmm6, xmm7);
71 
72  xmm3 = _mm_load_ps((float*)&points[2]);
73 
74  xmm4 = _mm_mul_ps(xmm4, xmm8);
75 
76  _mm_store_ps(target, xmm4);
77 
78  target += 4;
79 
80  }
81 
82  xmm4 = _mm_sub_ps(xmm1, xmm2);
83  xmm5 = _mm_sub_ps(xmm1, xmm3);
84 
85 
86 
87  points += 4;
88  xmm6 = _mm_mul_ps(xmm4, xmm4);
89  xmm7 = _mm_mul_ps(xmm5, xmm5);
90 
91  xmm4 = _mm_hadd_ps(xmm6, xmm7);
92 
93  xmm4 = _mm_mul_ps(xmm4, xmm8);
94 
95  _mm_store_ps(target, xmm4);
96 
97  target += 4;
98 
99 
100  for(i = 0; i < leftovers0; ++i) {
101 
102  xmm2 = _mm_load_ps((float*)&points[0]);
103 
104  xmm4 = _mm_sub_ps(xmm1, xmm2);
105 
106  points += 2;
107 
108  xmm6 = _mm_mul_ps(xmm4, xmm4);
109 
110  xmm4 = _mm_hadd_ps(xmm6, xmm6);
111 
112  xmm4 = _mm_mul_ps(xmm4, xmm8);
113 
114  _mm_storeh_pi((__m64*)target, xmm4);
115 
116  target += 2;
117  }
118 
119  for(i = 0; i < leftovers1; ++i) {
120 
121  diff = src0[0] - points[0];
122 
123  sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
124 
125  target[0] = sq_dist;
126  }
127 }
128 
129 #endif /*LV_HAVE_SSE3*/
130 
131 #ifdef LV_HAVE_GENERIC
132 static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points) {
133 
134  const unsigned int num_bytes = num_points*8;
135 
136  lv_32fc_t diff;
137  float sq_dist;
138  unsigned int i = 0;
139 
140  for(; i < num_bytes >> 3; ++i) {
141  diff = src0[0] - points[i];
142 
143  sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
144 
145  target[i] = sq_dist;
146  }
147 }
148 
149 #endif /*LV_HAVE_GENERIC*/
150 
151 
152 #endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/
float complex lv_32fc_t
Definition: volk_complex.h:56
#define lv_creal(x)
Definition: volk_complex.h:76
#define lv_cimag(x)
Definition: volk_complex.h:78