23 #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
24 #define INCLUDED_volk_32fc_x2_square_dist_32f_a_H
/*
 * SSE3 kernel: squared Euclidean distance between one complex reference
 * sample (src0[0]) and each complex sample in points[], written as floats
 * to target[].
 *
 * target     - output buffer; written with _mm_store_ps, which requires a
 *              16-byte aligned address.
 * src0       - reference sample; only the first element is read here.
 * points     - input samples; read with _mm_load_ps, which requires a
 *              16-byte aligned address.
 * num_points - number of complex samples to process.
 *
 * NOTE(review): this listing appears to be a lossy extract. The declarations
 * of `i` and `diff`, the advances of `points`/`target`, and the closing
 * braces are not visible here; confirm against the full source before
 * editing the code.
 */
34 static inline void volk_32fc_x2_square_dist_32f_a_sse3(
float* target,
lv_32fc_t* src0,
lv_32fc_t* points,
unsigned int num_points) {
/* 8 bytes per complex sample (two floats).
 * NOTE(review): the product is computed in unsigned int and can wrap for
 * very large num_points - confirm callers never get near that range. */
36 const unsigned int num_bytes = num_points*8;
38 __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
/* bound      = number of complete 32-byte groups (4 complex samples each).
 * leftovers0 = 1 if a 16-byte pair of samples remains after those groups.
 * leftovers1 = 1 if one single 8-byte sample remains after that. */
42 int bound = num_bytes >> 5;
43 int leftovers0 = (num_bytes >> 4) & 1;
44 int leftovers1 = (num_bytes >> 3) & 1;
/* Load the reference sample into the low two lanes of xmm1 (upper lanes
 * stay zero from the setzero). */
47 xmm1 = _mm_setzero_ps();
48 xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
/* Preload the first four samples (two per register) ahead of the loop.
 * NOTE(review): these loads are issued regardless of num_points, so they
 * read points[0..3] even when fewer than four samples exist - confirm the
 * buffer contract allows that. */
49 xmm2 = _mm_load_ps((
float*)&points[0]);
/* Duplicate the low half into the high half: xmm1 = {re, im, re, im}. */
50 xmm1 = _mm_movelh_ps(xmm1, xmm1);
51 xmm3 = _mm_load_ps((
float*)&points[2]);
/* Main loop runs bound - 1 times; each pass consumes the samples already
 * held in xmm2/xmm3 and loads the next four while the arithmetic is in
 * flight. */
54 for(; i < bound - 1; ++i) {
/* Component-wise difference between the reference and four samples. */
55 xmm4 = _mm_sub_ps(xmm1, xmm2);
56 xmm5 = _mm_sub_ps(xmm1, xmm3);
/* Square each real/imaginary difference. */
58 xmm6 = _mm_mul_ps(xmm4, xmm4);
59 xmm7 = _mm_mul_ps(xmm5, xmm5);
/* NOTE(review): no advance of `points` is visible before these reloads in
 * this listing; presumably it was dropped by the extraction - confirm. */
61 xmm2 = _mm_load_ps((
float*)&points[0]);
/* Horizontal add pairs re^2 + im^2, giving four squared distances. */
63 xmm4 = _mm_hadd_ps(xmm6, xmm7);
65 xmm3 = _mm_load_ps((
float*)&points[2]);
67 _mm_store_ps(target, xmm4);
/* Same sub/mul/hadd/store sequence for the samples still held in
 * xmm2/xmm3, with no further loads.
 * NOTE(review): the loop's closing brace is not visible here. If this
 * sequence runs unconditionally after the loop, it stores four floats to
 * target even when num_points < 4 - confirm against the full source. */
73 xmm4 = _mm_sub_ps(xmm1, xmm2);
74 xmm5 = _mm_sub_ps(xmm1, xmm3);
79 xmm6 = _mm_mul_ps(xmm4, xmm4);
80 xmm7 = _mm_mul_ps(xmm5, xmm5);
82 xmm4 = _mm_hadd_ps(xmm6, xmm7);
84 _mm_store_ps(target, xmm4);
/* Two-sample leftover: leftovers0 is 0 or 1, so this body runs at most
 * once. */
88 for(i = 0; i < leftovers0; ++i) {
90 xmm2 = _mm_load_ps((
float*)&points[0]);
92 xmm4 = _mm_sub_ps(xmm1, xmm2);
96 xmm6 = _mm_mul_ps(xmm4, xmm4);
/* hadd of a register with itself yields {d0, d1, d0, d1}, so the upper
 * half stored below holds the same two distances as the lower half. */
98 xmm4 = _mm_hadd_ps(xmm6, xmm6);
/* Store only two floats (the upper 64 bits of xmm4). */
100 _mm_storeh_pi((__m64*)target, xmm4);
/* Single-sample leftover, handled with scalar complex arithmetic;
 * leftovers1 is 0 or 1. The remainder of this body is not visible in this
 * listing. */
105 for(i = 0; i < leftovers1; ++i) {
107 diff = src0[0] - points[0];
118 #include <arm_neon.h>
/*
 * NEON kernel: squared Euclidean distance between one complex reference
 * sample (src0[0]) and each complex sample in points[], written as floats
 * to target[]. Four samples are processed per vector iteration.
 *
 * target     - output buffer, one float per input sample.
 * src0       - reference sample; only the first element is read here.
 * points     - input complex samples.
 * num_points - number of complex samples to process.
 *
 * NOTE(review): this listing appears to be a lossy extract. The declaration
 * of `number`, the advances of `points`/`target`, the tail-loop body and
 * the closing braces are not visible here; confirm against the full source.
 */
119 static inline void volk_32fc_x2_square_dist_32f_neon(
float* target,
lv_32fc_t* src0,
lv_32fc_t* points,
unsigned int num_points) {
/* Number of complete groups of four samples. */
120 const unsigned int quarter_points = num_points / 4;
123 float32x4x2_t a_vec, b_vec;
124 float32x4x2_t diff_vec;
125 float32x4_t tmp, tmp1, dist_sq;
/* Broadcast the reference sample: val[0] holds its real part in every
 * lane, val[1] holds its imaginary part in every lane. */
126 a_vec.val[0] = vdupq_n_f32(
lv_creal(src0[0]) );
127 a_vec.val[1] = vdupq_n_f32(
lv_cimag(src0[0]) );
128 for(number=0; number < quarter_points; ++number) {
/* vld2q_f32 is a deinterleaving load: four real parts land in val[0] and
 * four imaginary parts in val[1]. */
129 b_vec = vld2q_f32((
float*)points);
/* Per-lane differences of the real and imaginary parts. */
130 diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]);
131 diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]);
/* Square each difference. */
132 tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]);
133 tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]);
/* re^2 + im^2 gives four squared distances, stored as four floats. */
135 dist_sq = vaddq_f32(tmp, tmp1);
136 vst1q_f32(target, dist_sq);
/* Tail loop over the num_points % 4 samples left after the vector loop.
 * Its body is not visible in this listing. */
140 for(number=quarter_points*4; number < num_points; ++number) {
147 #ifdef LV_HAVE_GENERIC
/*
 * Portable scalar kernel: for each complex sample in points[], take the
 * complex difference from the reference sample src0[0].
 *
 * target     - output buffer (not written in the visible part of this
 *              listing; presumably receives one float per sample - confirm).
 * src0       - reference sample; only the first element is read here.
 * points     - input complex samples, indexed by the loop counter.
 * num_points - number of complex samples to process.
 *
 * NOTE(review): this listing appears to be a lossy extract. The declarations
 * of `i` and `diff`, the rest of the loop body and the closing braces are
 * not visible here; confirm against the full source.
 */
148 static inline void volk_32fc_x2_square_dist_32f_generic(
float* target,
lv_32fc_t* src0,
lv_32fc_t* points,
unsigned int num_points) {
/* 8 bytes per complex sample (two floats). */
150 const unsigned int num_bytes = num_points*8;
/* `>>` binds tighter than `<`, so the bound is (num_bytes >> 3). That
 * equals num_points unless the multiplication above wrapped.
 * NOTE(review): for very large num_points the *8 can wrap in unsigned int,
 * silently shortening the loop; comparing against num_points directly
 * would avoid this - confirm before changing. */
156 for(; i < num_bytes >> 3; ++i) {
/* Complex difference between the reference sample and the i-th sample. */
157 diff = src0[0] - points[i];
float complex lv_32fc_t
Definition: volk_complex.h:56
#define lv_creal(x)
Definition: volk_complex.h:76
#define lv_cimag(x)
Definition: volk_complex.h:78