GNU Radio Manual and C++ API Reference  3.7.6.1
The Free & Open Software Radio Ecosystem
volk_32fc_index_max_16u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 #ifndef INCLUDED_volk_32fc_index_max_16u_a_H
24 #define INCLUDED_volk_32fc_index_max_16u_a_H
25 
26 #include <volk/volk_common.h>
27 #include<inttypes.h>
28 #include<stdio.h>
29 #include<volk/volk_complex.h>
30 
31 #ifdef LV_HAVE_SSE3
32 #include<xmmintrin.h>
33 #include<pmmintrin.h>
34 
35 
36 static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_points) {
37 
38  const unsigned int num_bytes = num_points*8;
39 
40  union bit128 holderf;
41  union bit128 holderi;
42  float sq_dist = 0.0;
43 
44 
45 
46 
47  union bit128 xmm5, xmm4;
48  __m128 xmm1, xmm2, xmm3;
49  __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
50 
51  xmm5.int_vec = xmmfive = _mm_setzero_si128();
52  xmm4.int_vec = xmmfour = _mm_setzero_si128();
53  holderf.int_vec = holder0 = _mm_setzero_si128();
54  holderi.int_vec = holder1 = _mm_setzero_si128();
55 
56 
57  int bound = num_bytes >> 5;
58  int leftovers0 = (num_bytes >> 4) & 1;
59  int leftovers1 = (num_bytes >> 3) & 1;
60  int i = 0;
61 
62 
63  xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order!
64  xmm9 = xmm8 = _mm_setzero_si128();
65  xmm10 = _mm_set_epi32(4, 4, 4, 4);
66  xmm3 = _mm_setzero_ps();
67 ;
68 
69  //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
70 
71  for(; i < bound; ++i) {
72 
73  xmm1 = _mm_load_ps((float*)src0);
74  xmm2 = _mm_load_ps((float*)&src0[2]);
75 
76 
77  src0 += 4;
78 
79 
80  xmm1 = _mm_mul_ps(xmm1, xmm1);
81  xmm2 = _mm_mul_ps(xmm2, xmm2);
82 
83 
84  xmm1 = _mm_hadd_ps(xmm1, xmm2);
85 
86  xmm3 = _mm_max_ps(xmm1, xmm3);
87 
88  xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
89  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
90 
91 
92 
93  xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
94  xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
95 
96  xmm9 = _mm_add_epi32(xmm11, xmm12);
97 
98  xmm8 = _mm_add_epi32(xmm8, xmm10);
99 
100 
101  //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
102  //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]);
103 
104  }
105 
106 
107  for(i = 0; i < leftovers0; ++i) {
108 
109 
110  xmm2 = _mm_load_ps((float*)src0);
111 
112  xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
113  xmm8 = bit128_p(&xmm1)->int_vec;
114 
115  xmm2 = _mm_mul_ps(xmm2, xmm2);
116 
117  src0 += 2;
118 
119  xmm1 = _mm_hadd_ps(xmm2, xmm2);
120 
121  xmm3 = _mm_max_ps(xmm1, xmm3);
122 
123  xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]);
124 
125 
126  xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
127  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
128 
129 
130 
131  xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
132  xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
133 
134  xmm9 = _mm_add_epi32(xmm11, xmm12);
135 
136  xmm8 = _mm_add_epi32(xmm8, xmm10);
137  //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
138 
139  }
140 
141 
142 
143 
144  for(i = 0; i < leftovers1; ++i) {
145  //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
146 
147 
148  sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
149 
150  xmm2 = _mm_load1_ps(&sq_dist);
151 
152  xmm1 = xmm3;
153 
154  xmm3 = _mm_max_ss(xmm3, xmm2);
155 
156 
157 
158  xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
159  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
160 
161 
162  xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
163 
164  xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
165  xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
166 
167 
168  xmm9 = _mm_add_epi32(xmm11, xmm12);
169 
170  }
171 
172  //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
173 
174  //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
175 
176  _mm_store_ps((float*)&(holderf.f), xmm3);
177  _mm_store_si128(&(holderi.int_vec), xmm9);
178 
179  target[0] = holderi.i[0];
180  sq_dist = holderf.f[0];
181  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
182  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
183  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
184  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
185  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
186  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
187 
188 
189 
190  /*
191  float placeholder = 0.0;
192  uint32_t temp0, temp1;
193  unsigned int g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
194  unsigned int l0 = g0 ^ 1;
195 
196  unsigned int g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
197  unsigned int l1 = g1 ^ 1;
198 
199  temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
200  temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
201  sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
202  placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
203 
204  g0 = (sq_dist > placeholder);
205  l0 = g0 ^ 1;
206  target[0] = g0 * temp0 + l0 * temp1;
207  */
208 
209 }
210 
211 #endif /*LV_HAVE_SSE3*/
212 
213 #ifdef LV_HAVE_GENERIC
214 static inline void volk_32fc_index_max_16u_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_points) {
215 
216  const unsigned int num_bytes = num_points*8;
217 
218  float sq_dist = 0.0;
219  float max = 0.0;
220  unsigned int index = 0;
221 
222  unsigned int i = 0;
223 
224  for(; i < num_bytes >> 3; ++i) {
225 
226  sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
227 
228  index = sq_dist > max ? i : index;
229  max = sq_dist > max ? sq_dist : max;
230 
231 
232  }
233  target[0] = index;
234 
235 }
236 
237 #endif /*LV_HAVE_GENERIC*/
238 
239 
240 #endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/
#define bit128_p(x)
Definition: volk_common.h:94
float complex lv_32fc_t
Definition: volk_complex.h:56
#define lv_creal(x)
Definition: volk_complex.h:76
Definition: volk_common.h:78
#define lv_cimag(x)
Definition: volk_complex.h:78
uint32_t i[4]
Definition: volk_common.h:80