23 #ifndef INCLUDED_volk_32f_binary_slicer_8i_H
24 #define INCLUDED_volk_32f_binary_slicer_8i_H
27 #ifdef LV_HAVE_GENERIC
/*
 * Generic (portable) slicer kernel.
 *
 * Parameters, as visible in the signature:
 *   cVector    - output buffer of int8_t values
 *   aVector    - input buffer of floats (read-only)
 *   num_points - upper bound of the element loop below
 *
 * NOTE(review): the return type, braces and the loop body are not visible in
 * this listing, so the exact value written per element cannot be confirmed
 * from here.
 */
35 volk_32f_binary_slicer_8i_generic(
int8_t* cVector,
const float* aVector,
36 unsigned int num_points)
// Read cursor over the input floats.
39 const float* aPtr = aVector;
40 unsigned int number = 0;
// One iteration per input element.
42 for(number = 0; number < num_points; number++) {
54 #ifdef LV_HAVE_GENERIC
/*
 * Generic slicer kernel written without an if/else in the loop body.
 *
 * Parameters, as visible in the signature:
 *   cVector    - output buffer of int8_t values
 *   aVector    - input buffer of floats (read-only)
 *   num_points - number of elements processed
 *
 * NOTE(review): the declaration of cPtr is not visible in this listing;
 * presumably it is a write cursor starting at cVector - confirm.
 */
62 volk_32f_binary_slicer_8i_generic_branchless(
int8_t* cVector,
const float* aVector,
63 unsigned int num_points)
// Read cursor over the input floats.
66 const float* aPtr = aVector;
67 unsigned int number = 0;
69 for(number = 0; number < num_points; number++){
// The comparison evaluates to 1 when the sample is >= 0 and to 0 otherwise;
// that value is stored directly and both cursors advance by one element.
70 *cPtr++ = (*aPtr++ >= 0);
77 #include <emmintrin.h>
/*
 * SSE2 slicer kernel using aligned loads and stores.
 *
 * Parameters, as visible in the signature:
 *   cVector    - output buffer of int8_t values
 *   aVector    - input buffer of floats (read-only)
 *   num_points - number of elements to process
 *
 * The vector loop handles 16 floats per iteration; a scalar loop covers the
 * remaining num_points % 16 elements.
 *
 * _mm_load_ps and _mm_store_si128 require 16-byte aligned addresses, so both
 * buffers must be 16-byte aligned at the positions accessed here.
 *
 * NOTE(review): the declarations of cPtr and zero_val, the pointer advances
 * inside the vector loop and the body of the tail loop are not visible in
 * this listing.
 */
85 volk_32f_binary_slicer_8i_a_sse2(
int8_t* cVector,
const float* aVector,
86 unsigned int num_points)
// Read cursor over the input floats.
89 const float* aPtr = aVector;
90 unsigned int number = 0;
// Number of full 16-element groups.
92 unsigned int n16points = num_points / 16;
93 __m128 a0_val, a1_val, a2_val, a3_val;
94 __m128 res0_f, res1_f, res2_f, res3_f;
95 __m128i res0_i, res1_i, res2_i, res3_i;
// Comparison threshold: all four lanes hold 0.0f.
97 zero_val = _mm_set1_ps(0.0f);
99 for(number = 0; number < n16points; number++) {
// Load 16 consecutive floats as four 4-lane vectors (aligned loads).
100 a0_val = _mm_load_ps(aPtr);
101 a1_val = _mm_load_ps(aPtr+4);
102 a2_val = _mm_load_ps(aPtr+8);
103 a3_val = _mm_load_ps(aPtr+12);
// Per-lane compare against zero: a lane becomes all-ones bits when the
// sample is >= 0 and all-zero bits otherwise.
106 res0_f = _mm_cmpge_ps(a0_val, zero_val);
107 res1_f = _mm_cmpge_ps(a1_val, zero_val);
108 res2_f = _mm_cmpge_ps(a2_val, zero_val);
109 res3_f = _mm_cmpge_ps(a3_val, zero_val);
// Turn each mask lane into 0 or 1. An all-ones lane is a NaN bit pattern;
// per Intel's documentation the float->int32 conversion of such a value
// yields 0x80000000, whose logical right shift by 31 is 1. An all-zero lane
// is 0.0f, which converts to 0 and stays 0 after the shift.
112 res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
113 res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
114 res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
115 res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
// Narrow 32-bit lanes to 16-bit (signed saturation; 0/1 are unaffected).
// Element order is preserved: first operand fills the low half.
118 res0_i = _mm_packs_epi32(res0_i, res1_i);
119 res2_i = _mm_packs_epi32(res2_i, res3_i);
// Narrow 16-bit lanes to 8-bit, giving 16 result bytes in input order.
122 res0_i = _mm_packs_epi16(res0_i, res2_i);
// Write the 16 result bytes with an aligned store.
124 _mm_store_si128((__m128i*)cPtr, res0_i);
// Scalar tail: elements left over after the last full group of 16.
130 for(number = n16points * 16; number < num_points; number++) {
144 #include <emmintrin.h>
/*
 * SSE2 slicer kernel using unaligned loads and stores.
 *
 * Parameters, as visible in the signature:
 *   cVector    - output buffer of int8_t values
 *   aVector    - input buffer of floats (read-only)
 *   num_points - number of elements to process
 *
 * The vector loop handles 16 floats per iteration; a scalar loop covers the
 * remaining num_points % 16 elements. _mm_loadu_ps and _mm_storeu_si128 place
 * no alignment requirement on the addresses they access.
 *
 * NOTE(review): the declarations of cPtr and zero_val, the pointer advances
 * inside the vector loop and the body of the tail loop are not visible in
 * this listing.
 */
152 volk_32f_binary_slicer_8i_u_sse2(
int8_t* cVector,
const float* aVector,
153 unsigned int num_points)
// Read cursor over the input floats.
156 const float* aPtr = aVector;
157 unsigned int number = 0;
// Number of full 16-element groups.
159 unsigned int n16points = num_points / 16;
160 __m128 a0_val, a1_val, a2_val, a3_val;
161 __m128 res0_f, res1_f, res2_f, res3_f;
162 __m128i res0_i, res1_i, res2_i, res3_i;
// Comparison threshold: all four lanes hold 0.0f.
164 zero_val = _mm_set1_ps (0.0f);
166 for(number = 0; number < n16points; number++) {
// Load 16 consecutive floats as four 4-lane vectors (unaligned loads).
167 a0_val = _mm_loadu_ps(aPtr);
168 a1_val = _mm_loadu_ps(aPtr+4);
169 a2_val = _mm_loadu_ps(aPtr+8);
170 a3_val = _mm_loadu_ps(aPtr+12);
// Per-lane compare against zero: a lane becomes all-ones bits when the
// sample is >= 0 and all-zero bits otherwise.
173 res0_f = _mm_cmpge_ps(a0_val, zero_val);
174 res1_f = _mm_cmpge_ps(a1_val, zero_val);
175 res2_f = _mm_cmpge_ps(a2_val, zero_val);
176 res3_f = _mm_cmpge_ps(a3_val, zero_val);
// Turn each mask lane into 0 or 1. An all-ones lane is a NaN bit pattern;
// per Intel's documentation the float->int32 conversion of such a value
// yields 0x80000000, whose logical right shift by 31 is 1. An all-zero lane
// is 0.0f, which converts to 0 and stays 0 after the shift.
179 res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
180 res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
181 res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
182 res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
// Narrow 32-bit lanes to 16-bit (signed saturation; 0/1 are unaffected).
// Element order is preserved: first operand fills the low half.
185 res0_i = _mm_packs_epi32(res0_i, res1_i);
186 res2_i = _mm_packs_epi32(res2_i, res3_i);
// Narrow 16-bit lanes to 8-bit, giving 16 result bytes in input order.
189 res0_i = _mm_packs_epi16(res0_i, res2_i);
// Write the 16 result bytes with an unaligned store.
191 _mm_storeu_si128((__m128i*)cPtr, res0_i);
// Scalar tail: elements left over after the last full group of 16.
197 for(number = n16points * 16; number < num_points; number++) {
210 #include <arm_neon.h>
/*
 * ARM NEON slicer kernel.
 *
 * Parameters, as visible in the signature:
 *   cVector    - output buffer of int8_t values
 *   aVector    - input buffer of floats (read-only)
 *   num_points - number of elements to process
 *
 * The vector loop handles 16 floats per iteration; a scalar loop covers the
 * remaining num_points % 16 elements.
 *
 * The loads de-interleave the input (even-indexed samples in val[0],
 * odd-indexed in val[1]); the final vst2_u8 store re-interleaves the two
 * halves, so results are written in the same order as the input samples.
 *
 * NOTE(review): the declarations of cPtr, one and res_u8, the pointer
 * advances inside the vector loop and the body of the tail loop are not
 * visible in this listing.
 */
218 volk_32f_binary_slicer_8i_neon(
int8_t* cVector,
const float* aVector,
219 unsigned int num_points)
// Read cursor over the input floats.
222 const float* aPtr = aVector;
223 unsigned int number = 0;
// Number of full 16-element groups.
224 unsigned int n16points = num_points / 16;
226 float32x4x2_t input_val0, input_val1;
227 float32x4_t zero_val;
228 uint32x4x2_t res0_u32, res1_u32;
229 uint16x4x2_t res0_u16x4, res1_u16x4;
230 uint16x8x2_t res_u16x8;
// Comparison threshold (all lanes 0.0) and a per-byte mask of 0x01.
234 zero_val = vdupq_n_f32(0.0);
235 one = vdup_n_u8(0x01);
240 for(number = 0; number < n16points; number++) {
// Each vld2q_f32 reads 8 floats and splits them into even/odd lanes:
// input_val0 covers samples 0..7, input_val1 covers samples 8..15.
241 input_val0 = vld2q_f32(aPtr);
242 input_val1 = vld2q_f32(aPtr+8);
// Per-lane compare against zero: all-ones 32-bit lane when the sample is
// >= 0, all-zero lane otherwise.
245 res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
246 res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
247 res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
248 res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);
// Narrow each 32-bit mask lane to 16 bits (keeps the low half, so the
// lanes stay all-ones or all-zero).
251 res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
252 res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
253 res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
254 res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);
// Gather the eight even-indexed results into val[0] and the eight
// odd-indexed results into val[1].
256 res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
257 res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);
// Narrow to 8 bits per result: each byte is now 0xFF or 0x00.
260 res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
261 res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);
// Mask with 0x01 so each byte becomes 1 (sample >= 0) or 0.
268 res_u8.val[0] = vand_u8(one, res_u8.val[0]);
269 res_u8.val[1] = vand_u8(one, res_u8.val[1]);
// Interleaving store of 16 bytes: alternates val[0] and val[1] elements,
// restoring the original sample order.
271 vst2_u8((
unsigned char*)cPtr, res_u8);
// Scalar tail: elements left over after the last full group of 16.
277 for(number = n16points * 16; number < num_points; number++) {
signed char int8_t
Definition: stdint.h:75