GNU Radio Manual and C++ API Reference  3.7.6.1
The Free & Open Software Radio Ecosystem
volk_32f_binary_slicer_8i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 #ifndef INCLUDED_volk_32f_binary_slicer_8i_H
24 #define INCLUDED_volk_32f_binary_slicer_8i_H
25 
26 
27 #ifdef LV_HAVE_GENERIC
/*!
  \brief Slices floats into hard bits: writes 1 when the input is greater than or equal to 0, 0 otherwise
  \param cVector The char (int8_t) output, one value (either 0 or 1) per input sample
  \param aVector The float input
  \param num_points The number of values read from aVector and stored into cVector
*/
static inline void
volk_32f_binary_slicer_8i_generic(int8_t* cVector, const float* aVector,
                                  unsigned int num_points)
{
  unsigned int idx;

  for(idx = 0; idx < num_points; idx++) {
    // Start from the "negative" decision and only flip it when the
    // sample passes the >= 0 test. NaN fails that test and stays 0.
    int8_t bit = 0;
    if(aVector[idx] >= 0) {
      bit = 1;
    }
    cVector[idx] = bit;
  }
}
51 #endif /* LV_HAVE_GENERIC */
52 
53 
54 #ifdef LV_HAVE_GENERIC
/*!
  \brief Branch-free slicer: writes 1 when the float input is greater than or equal to 0, 0 otherwise
  \param cVector The char (int8_t) output, one value (either 0 or 1) per input sample
  \param aVector The float input
  \param num_points The number of values read from aVector and stored into cVector
*/
static inline void
volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector, const float* aVector,
                                             unsigned int num_points)
{
  unsigned int idx;

  for(idx = 0; idx < num_points; idx++) {
    // The comparison itself evaluates to exactly 0 or 1, so its value is
    // stored directly instead of selecting a constant with if/else.
    cVector[idx] = (int8_t)(aVector[idx] >= 0);
  }
}
73 #endif /* LV_HAVE_GENERIC */
74 
75 
76 #ifdef LV_HAVE_SSE2
77 #include <emmintrin.h>
/*!
  \brief Returns integer 1 if float input is greater than or equal to 0, 0 otherwise

  Aligned SSE2 version: 16 samples are processed per iteration using
  _mm_load_ps / _mm_store_si128, so both aVector and cVector must be
  16-byte aligned. Any remaining (num_points % 16) samples are handled
  by a scalar loop.

  \param cVector The char (int8_t) output (either 0 or 1)
  \param aVector The float input
  \param num_points The number of values in aVector and stored into cVector
*/
static inline void
volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector, const float* aVector,
                                 unsigned int num_points)
{
  int8_t* cPtr = cVector;
  const float* aPtr = aVector;
  unsigned int number = 0;

  unsigned int n16points = num_points / 16;
  __m128 a0_val, a1_val, a2_val, a3_val;
  __m128 res0_f, res1_f, res2_f, res3_f;
  __m128i res0_i, res1_i, res2_i, res3_i;
  __m128 zero_val;
  zero_val = _mm_set1_ps(0.0f);

  for(number = 0; number < n16points; number++) {
    a0_val = _mm_load_ps(aPtr);
    a1_val = _mm_load_ps(aPtr+4);
    a2_val = _mm_load_ps(aPtr+8);
    a3_val = _mm_load_ps(aPtr+12);

    // compare >= 0; each lane becomes all ones (true) or all zeros (false)
    res0_f = _mm_cmpge_ps(a0_val, zero_val);
    res1_f = _mm_cmpge_ps(a1_val, zero_val);
    res2_f = _mm_cmpge_ps(a2_val, zero_val);
    res3_f = _mm_cmpge_ps(a3_val, zero_val);

    // Reinterpret the mask bits as 32-bit integers and >> 31 so that
    // 0xFFFFFFFF -> 1 and 0 -> 0. A bit cast is used on purpose: the
    // all-ones mask is a NaN pattern, and converting it with
    // _mm_cvtps_epi32 would raise the invalid-operation flag and only
    // works because the conversion yields 0x80000000 for NaN.
    res0_i = _mm_srli_epi32(_mm_castps_si128(res0_f), 31);
    res1_i = _mm_srli_epi32(_mm_castps_si128(res1_f), 31);
    res2_i = _mm_srli_epi32(_mm_castps_si128(res2_f), 31);
    res3_i = _mm_srli_epi32(_mm_castps_si128(res3_f), 31);

    // pack into 16-bit results (values are 0/1, so saturation never triggers)
    res0_i = _mm_packs_epi32(res0_i, res1_i);
    res2_i = _mm_packs_epi32(res2_i, res3_i);

    // pack into 8-bit results
    res0_i = _mm_packs_epi16(res0_i, res2_i);

    _mm_store_si128((__m128i*)cPtr, res0_i);

    cPtr += 16;
    aPtr += 16;
  }

  // scalar tail for the samples that do not fill a whole 16-wide block
  for(number = n16points * 16; number < num_points; number++) {
    if( *aPtr++ >= 0) {
      *cPtr++ = 1;
    }
    else {
      *cPtr++ = 0;
    }
  }
}
139 #endif /* LV_HAVE_SSE2 */
140 
141 
142 
143 #ifdef LV_HAVE_SSE2
144 #include <emmintrin.h>
/*!
  \brief Returns integer 1 if float input is greater than or equal to 0, 0 otherwise

  Unaligned SSE2 version: 16 samples are processed per iteration using
  _mm_loadu_ps / _mm_storeu_si128, so neither buffer needs any particular
  alignment. Any remaining (num_points % 16) samples are handled by a
  scalar loop.

  \param cVector The char (int8_t) output (either 0 or 1)
  \param aVector The float input
  \param num_points The number of values in aVector and stored into cVector
*/
static inline void
volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, const float* aVector,
                                 unsigned int num_points)
{
  int8_t* cPtr = cVector;
  const float* aPtr = aVector;
  unsigned int number = 0;

  unsigned int n16points = num_points / 16;
  __m128 a0_val, a1_val, a2_val, a3_val;
  __m128 res0_f, res1_f, res2_f, res3_f;
  __m128i res0_i, res1_i, res2_i, res3_i;
  __m128 zero_val;
  zero_val = _mm_set1_ps(0.0f);

  for(number = 0; number < n16points; number++) {
    a0_val = _mm_loadu_ps(aPtr);
    a1_val = _mm_loadu_ps(aPtr+4);
    a2_val = _mm_loadu_ps(aPtr+8);
    a3_val = _mm_loadu_ps(aPtr+12);

    // compare >= 0; each lane becomes all ones (true) or all zeros (false)
    res0_f = _mm_cmpge_ps(a0_val, zero_val);
    res1_f = _mm_cmpge_ps(a1_val, zero_val);
    res2_f = _mm_cmpge_ps(a2_val, zero_val);
    res3_f = _mm_cmpge_ps(a3_val, zero_val);

    // Reinterpret the mask bits as 32-bit integers and >> 31 so that
    // 0xFFFFFFFF -> 1 and 0 -> 0. A bit cast is used on purpose: the
    // all-ones mask is a NaN pattern, and converting it with
    // _mm_cvtps_epi32 would raise the invalid-operation flag and only
    // works because the conversion yields 0x80000000 for NaN.
    res0_i = _mm_srli_epi32(_mm_castps_si128(res0_f), 31);
    res1_i = _mm_srli_epi32(_mm_castps_si128(res1_f), 31);
    res2_i = _mm_srli_epi32(_mm_castps_si128(res2_f), 31);
    res3_i = _mm_srli_epi32(_mm_castps_si128(res3_f), 31);

    // pack into 16-bit results (values are 0/1, so saturation never triggers)
    res0_i = _mm_packs_epi32(res0_i, res1_i);
    res2_i = _mm_packs_epi32(res2_i, res3_i);

    // pack into 8-bit results
    res0_i = _mm_packs_epi16(res0_i, res2_i);

    _mm_storeu_si128((__m128i*)cPtr, res0_i);

    cPtr += 16;
    aPtr += 16;
  }

  // scalar tail for the samples that do not fill a whole 16-wide block
  for(number = n16points * 16; number < num_points; number++) {
    if( *aPtr++ >= 0) {
      *cPtr++ = 1;
    }
    else {
      *cPtr++ = 0;
    }
  }
}
206 #endif /* LV_HAVE_SSE2 */
207 
208 
209 #ifdef LV_HAVE_NEON
210 #include <arm_neon.h>
/*!
  \brief NEON slicer: writes 1 when the float input is greater than or equal to 0, 0 otherwise

  Works on blocks of 16 samples; the remaining (num_points % 16) samples
  are handled by a scalar loop.

  \param cVector The char (int8_t) output (either 0 or 1)
  \param aVector The float input
  \param num_points The number of values in aVector and stored into cVector
*/
static inline void
volk_32f_binary_slicer_8i_neon(int8_t* cVector, const float* aVector,
                               unsigned int num_points)
{
  const unsigned int num_blocks = num_points / 16;
  const float32x4_t zeros = vdupq_n_f32(0.0f);
  const uint8x8_t lsb_mask = vdup_n_u8(0x01);
  const float* in = aVector;
  int8_t* out = cVector;
  unsigned int block;
  unsigned int idx;

  // TODO: this is a good candidate for asm because the vcombines
  // can be eliminated simply by picking dst registers that are
  // adjacent.
  for(block = 0; block < num_blocks; block++) {
    // vld2q de-interleaves: val[0] receives the even-indexed samples of
    // the 8 floats read, val[1] the odd-indexed ones
    const float32x4x2_t lo = vld2q_f32(in);
    const float32x4x2_t hi = vld2q_f32(in + 8);

    // test against 0; every 32-bit lane becomes all ones or all zeros
    const uint32x4_t lo_even = vcgeq_f32(lo.val[0], zeros);
    const uint32x4_t lo_odd = vcgeq_f32(lo.val[1], zeros);
    const uint32x4_t hi_even = vcgeq_f32(hi.val[0], zeros);
    const uint32x4_t hi_odd = vcgeq_f32(hi.val[1], zeros);

    // narrow uint32 -> uint16, then join the two halves so that all 8
    // even results sit in one vector and all 8 odd results in the other
    const uint16x8_t even_u16 = vcombine_u16(vmovn_u32(lo_even), vmovn_u32(hi_even));
    const uint16x8_t odd_u16 = vcombine_u16(vmovn_u32(lo_odd), vmovn_u32(hi_odd));

    // narrow uint16x8 -> uint8x8 and keep only the lowest bit (0xFF -> 1)
    //
    // we *could* load twice as much data and do another vcombine here
    // to get a uint8x16x2 vector, still only do 2 vandqs and a single store
    // but that turns out to be ~16% slower than this version on zc702
    // it's possible register contention in GCC scheduler slows it down
    // and a hand-written asm with quad-word u8 registers is much faster.
    uint8x8x2_t bits;
    bits.val[0] = vand_u8(lsb_mask, vmovn_u16(even_u16));
    bits.val[1] = vand_u8(lsb_mask, vmovn_u16(odd_u16));

    // vst2 re-interleaves even/odd lanes, restoring the input order
    vst2_u8((unsigned char*)out, bits);

    out += 16;
    in += 16;
  }

  // scalar tail for the samples that do not fill a whole 16-wide block
  for(idx = num_blocks * 16; idx < num_points; idx++) {
    if(aVector[idx] >= 0) {
      cVector[idx] = 1;
    }
    else {
      cVector[idx] = 0;
    }
  }
}
286 #endif /* LV_HAVE_NEON */
287 
288 
289 #endif /* INCLUDED_volk_32f_binary_slicer_8i_H */
signed char int8_t
Definition: stdint.h:75