23 #ifndef INCLUDED_volk_32fc_magnitude_32f_u_H
24 #define INCLUDED_volk_32fc_magnitude_32f_u_H
31 #include <immintrin.h>
38 static inline void volk_32fc_magnitude_32f_u_avx(
float* magnitudeVector,
const lv_32fc_t* complexVector,
unsigned int num_points){
39 unsigned int number = 0;
40 const unsigned int eighthPoints = num_points / 8;
42 const float* complexVectorPtr = (
float*)complexVector;
43 float* magnitudeVectorPtr = magnitudeVector;
45 __m256 cplxValue1, cplxValue2, complex1, complex2, result;
46 for(;number < eighthPoints; number++){
47 cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
48 complexVectorPtr += 8;
50 cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
51 complexVectorPtr += 8;
53 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1);
54 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2);
56 complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
57 complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
59 result = _mm256_hadd_ps(complex1, complex2);
61 result = _mm256_sqrt_ps(result);
63 _mm256_storeu_ps(magnitudeVectorPtr, result);
64 magnitudeVectorPtr += 8;
67 number = eighthPoints * 8;
68 for(; number < num_points; number++){
69 float val1Real = *complexVectorPtr++;
70 float val1Imag = *complexVectorPtr++;
71 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
77 #include <pmmintrin.h>
84 static inline void volk_32fc_magnitude_32f_u_sse3(
float* magnitudeVector,
const lv_32fc_t* complexVector,
unsigned int num_points){
85 unsigned int number = 0;
86 const unsigned int quarterPoints = num_points / 4;
88 const float* complexVectorPtr = (
float*)complexVector;
89 float* magnitudeVectorPtr = magnitudeVector;
91 __m128 cplxValue1, cplxValue2, result;
92 for(;number < quarterPoints; number++){
93 cplxValue1 = _mm_loadu_ps(complexVectorPtr);
94 complexVectorPtr += 4;
96 cplxValue2 = _mm_loadu_ps(complexVectorPtr);
97 complexVectorPtr += 4;
99 cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1);
100 cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2);
102 result = _mm_hadd_ps(cplxValue1, cplxValue2);
104 result = _mm_sqrt_ps(result);
106 _mm_storeu_ps(magnitudeVectorPtr, result);
107 magnitudeVectorPtr += 4;
110 number = quarterPoints * 4;
111 for(; number < num_points; number++){
112 float val1Real = *complexVectorPtr++;
113 float val1Imag = *complexVectorPtr++;
114 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
120 #include <xmmintrin.h>
127 static inline void volk_32fc_magnitude_32f_u_sse(
float* magnitudeVector,
const lv_32fc_t* complexVector,
unsigned int num_points){
128 unsigned int number = 0;
129 const unsigned int quarterPoints = num_points / 4;
131 const float* complexVectorPtr = (
float*)complexVector;
132 float* magnitudeVectorPtr = magnitudeVector;
134 __m128 cplxValue1, cplxValue2, iValue, qValue, result;
135 for(;number < quarterPoints; number++){
136 cplxValue1 = _mm_loadu_ps(complexVectorPtr);
137 complexVectorPtr += 4;
139 cplxValue2 = _mm_loadu_ps(complexVectorPtr);
140 complexVectorPtr += 4;
143 iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
145 qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
147 iValue = _mm_mul_ps(iValue, iValue);
148 qValue = _mm_mul_ps(qValue, qValue);
150 result = _mm_add_ps(iValue, qValue);
152 result = _mm_sqrt_ps(result);
154 _mm_storeu_ps(magnitudeVectorPtr, result);
155 magnitudeVectorPtr += 4;
158 number = quarterPoints * 4;
159 for(; number < num_points; number++){
160 float val1Real = *complexVectorPtr++;
161 float val1Imag = *complexVectorPtr++;
162 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
167 #ifdef LV_HAVE_GENERIC
174 static inline void volk_32fc_magnitude_32f_generic(
float* magnitudeVector,
const lv_32fc_t* complexVector,
unsigned int num_points){
175 const float* complexVectorPtr = (
float*)complexVector;
176 float* magnitudeVectorPtr = magnitudeVector;
177 unsigned int number = 0;
178 for(number = 0; number < num_points; number++){
179 const float real = *complexVectorPtr++;
180 const float imag = *complexVectorPtr++;
181 *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
187 #ifndef INCLUDED_volk_32fc_magnitude_32f_a_H
188 #define INCLUDED_volk_32fc_magnitude_32f_a_H
195 #include <immintrin.h>
202 static inline void volk_32fc_magnitude_32f_a_avx(
float* magnitudeVector,
const lv_32fc_t* complexVector,
unsigned int num_points){
203 unsigned int number = 0;
204 const unsigned int eighthPoints = num_points / 8;
206 const float* complexVectorPtr = (
float*)complexVector;
207 float* magnitudeVectorPtr = magnitudeVector;
209 __m256 cplxValue1, cplxValue2, complex1, complex2, result;
210 for(;number < eighthPoints; number++){
211 cplxValue1 = _mm256_load_ps(complexVectorPtr);
212 complexVectorPtr += 8;
214 cplxValue2 = _mm256_load_ps(complexVectorPtr);
215 complexVectorPtr += 8;
217 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1);
218 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2);
220 complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
221 complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
223 result = _mm256_hadd_ps(complex1, complex2);
225 result = _mm256_sqrt_ps(result);
227 _mm256_store_ps(magnitudeVectorPtr, result);
228 magnitudeVectorPtr += 8;
231 number = eighthPoints * 8;
232 for(; number < num_points; number++){
233 float val1Real = *complexVectorPtr++;
234 float val1Imag = *complexVectorPtr++;
235 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
241 #include <pmmintrin.h>
248 static inline void volk_32fc_magnitude_32f_a_sse3(
float* magnitudeVector,
const lv_32fc_t* complexVector,
unsigned int num_points){
249 unsigned int number = 0;
250 const unsigned int quarterPoints = num_points / 4;
252 const float* complexVectorPtr = (
float*)complexVector;
253 float* magnitudeVectorPtr = magnitudeVector;
255 __m128 cplxValue1, cplxValue2, result;
256 for(;number < quarterPoints; number++){
257 cplxValue1 = _mm_load_ps(complexVectorPtr);
258 complexVectorPtr += 4;
260 cplxValue2 = _mm_load_ps(complexVectorPtr);
261 complexVectorPtr += 4;
263 cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1);
264 cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2);
266 result = _mm_hadd_ps(cplxValue1, cplxValue2);
268 result = _mm_sqrt_ps(result);
270 _mm_store_ps(magnitudeVectorPtr, result);
271 magnitudeVectorPtr += 4;
274 number = quarterPoints * 4;
275 for(; number < num_points; number++){
276 float val1Real = *complexVectorPtr++;
277 float val1Imag = *complexVectorPtr++;
278 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
284 #include <xmmintrin.h>
291 static inline void volk_32fc_magnitude_32f_a_sse(
float* magnitudeVector,
const lv_32fc_t* complexVector,
unsigned int num_points){
292 unsigned int number = 0;
293 const unsigned int quarterPoints = num_points / 4;
295 const float* complexVectorPtr = (
float*)complexVector;
296 float* magnitudeVectorPtr = magnitudeVector;
298 __m128 cplxValue1, cplxValue2, iValue, qValue, result;
299 for(;number < quarterPoints; number++){
300 cplxValue1 = _mm_load_ps(complexVectorPtr);
301 complexVectorPtr += 4;
303 cplxValue2 = _mm_load_ps(complexVectorPtr);
304 complexVectorPtr += 4;
307 iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
309 qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
311 iValue = _mm_mul_ps(iValue, iValue);
312 qValue = _mm_mul_ps(qValue, qValue);
314 result = _mm_add_ps(iValue, qValue);
316 result = _mm_sqrt_ps(result);
318 _mm_store_ps(magnitudeVectorPtr, result);
319 magnitudeVectorPtr += 4;
322 number = quarterPoints * 4;
323 for(; number < num_points; number++){
324 float val1Real = *complexVectorPtr++;
325 float val1Imag = *complexVectorPtr++;
326 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
331 #ifdef LV_HAVE_GENERIC
338 static inline void volk_32fc_magnitude_32f_a_generic(
float* magnitudeVector,
const lv_32fc_t* complexVector,
unsigned int num_points){
339 const float* complexVectorPtr = (
float*)complexVector;
340 float* magnitudeVectorPtr = magnitudeVector;
341 unsigned int number = 0;
342 for(number = 0; number < num_points; number++){
343 const float real = *complexVectorPtr++;
344 const float imag = *complexVectorPtr++;
345 *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
351 #include <arm_neon.h>
/*!
  \brief Computes the magnitude of each complex sample (NEON).

  For every point i this writes an approximation of sqrt(re[i]^2 + im[i]^2)
  to magnitudeVector[i]. The vector loop derives the square root from the
  NEON reciprocal-square-root and reciprocal estimate instructions, so its
  results are lower precision than sqrtf(). The scalar tail uses sqrtf().

  \param magnitudeVector Output buffer; receives num_points floats
  \param complexVector   Input samples, read as interleaved (real, imag)
                         float pairs
  \param num_points      Number of complex samples to process
*/
static inline void volk_32fc_magnitude_32f_neon(float* magnitudeVector,
                                                const lv_32fc_t* complexVector,
                                                unsigned int num_points)
{
    unsigned int number;
    const unsigned int quarter_points = num_points / 4;

    /* Keep the const qualifier while viewing the samples as plain floats. */
    const float* complexVectorPtr = (const float*)complexVector;
    float* magnitudeVectorPtr = magnitudeVector;

    float32x4x2_t complex_vec;
    float32x4_t magnitude_vec;
    for (number = 0; number < quarter_points; number++) {
        /* De-interleaving load: val[0] holds 4 reals, val[1] 4 imaginaries. */
        complex_vec = vld2q_f32(complexVectorPtr);

        /* re^2, then multiply-accumulate im^2 on top of it. */
        complex_vec.val[0] = vmulq_f32(complex_vec.val[0], complex_vec.val[0]);
        magnitude_vec =
            vmlaq_f32(complex_vec.val[0], complex_vec.val[1], complex_vec.val[1]);

        /* sqrt(x) approximated as 1 / (1 / sqrt(x)) using two estimates. */
        magnitude_vec = vrsqrteq_f32(magnitude_vec);
        magnitude_vec = vrecpeq_f32(magnitude_vec);

        vst1q_f32(magnitudeVectorPtr, magnitude_vec);

        complexVectorPtr += 8;
        magnitudeVectorPtr += 4;
    }

    /* Scalar tail for the samples that do not fill a block of four. */
    for (number = quarter_points * 4; number < num_points; number++) {
        const float real = *complexVectorPtr++;
        const float imag = *complexVectorPtr++;
        *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
    }
}
/*!
  \brief Approximates the magnitude of each complex sample without a square
         root (NEON).

  In the vector loop the magnitude is estimated as
      a * max(|re|, |im|) + b * min(|re|, |im|)
  where (a, b) = (0.84, 0.561) when min > 0.4142135 * max, and
  (a, b) = (0.99, 0.197) otherwise. The scalar tail computes
  sqrt(re^2 + im^2) with sqrtf(), so tail samples are not approximated
  the same way as the vectorised ones.

  \param magnitudeVector Output buffer; receives num_points floats
  \param complexVector   Input samples, read as interleaved (real, imag)
                         float pairs
  \param num_points      Number of complex samples to process
*/
static inline void volk_32fc_magnitude_32f_neon_fancy_sweet(
    float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
{
    unsigned int number;
    const unsigned int quarter_points = num_points / 4;

    /* Keep the const qualifier while viewing the samples as plain floats. */
    const float* complexVectorPtr = (const float*)complexVector;
    float* magnitudeVectorPtr = magnitudeVector;

    /* Ratio min/max at which the coefficient pair switches. */
    const float threshold = 0.4142135;

    /* Coefficient pairs for the two regions of the approximation. */
    const float32x4_t a_high = vdupq_n_f32(0.84);
    const float32x4_t b_high = vdupq_n_f32(0.561);
    const float32x4_t a_low = vdupq_n_f32(0.99);
    const float32x4_t b_low = vdupq_n_f32(0.197);

    float32x4x2_t complex_vec;
    float32x4_t a_vec, b_vec;
    float32x4_t min_vec, max_vec, scaled_max, magnitude_vec;
    float32x4_t real_abs, imag_abs;
    uint32x4_t use_high;

    for (number = 0; number < quarter_points; number++) {
        /* De-interleaving load: val[0] holds 4 reals, val[1] 4 imaginaries. */
        complex_vec = vld2q_f32(complexVectorPtr);

        real_abs = vabsq_f32(complex_vec.val[0]);
        imag_abs = vabsq_f32(complex_vec.val[1]);

        min_vec = vminq_f32(real_abs, imag_abs);
        max_vec = vmaxq_f32(real_abs, imag_abs);

        /* Per-lane mask: all ones where min > threshold * max. */
        scaled_max = vmulq_n_f32(max_vec, threshold);
        use_high = vcgtq_f32(min_vec, scaled_max);

        /* Bitwise select picks the coefficient pair per lane; lanes where
           the comparison is false (including NaN lanes) take the low pair. */
        a_vec = vbslq_f32(use_high, a_high, a_low);
        b_vec = vbslq_f32(use_high, b_high, b_low);

        min_vec = vmulq_f32(min_vec, b_vec);
        max_vec = vmulq_f32(max_vec, a_vec);

        magnitude_vec = vaddq_f32(min_vec, max_vec);
        vst1q_f32(magnitudeVectorPtr, magnitude_vec);

        complexVectorPtr += 8;
        magnitudeVectorPtr += 4;
    }

    /* Scalar tail for the samples that do not fill a block of four. */
    for (number = quarter_points * 4; number < num_points; number++) {
        const float real = *complexVectorPtr++;
        const float imag = *complexVectorPtr++;
        *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
    }
}
465 extern void volk_32fc_magnitude_32f_a_orc_impl(
float* magnitudeVector,
const lv_32fc_t* complexVector,
unsigned int num_points);
466 static inline void volk_32fc_magnitude_32f_u_orc(
float* magnitudeVector,
const lv_32fc_t* complexVector,
unsigned int num_points){
467 volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points);
/* lv_32fc_t: float complex (definition: volk_complex.h:56) */