23 #ifndef INCLUDED_volk_32fc_index_max_16u_a_H
24 #define INCLUDED_volk_32fc_index_max_16u_a_H
/*
 * SSE3 kernel that tracks, per vector lane, the largest squared magnitude
 * (re*re + im*im) seen in src0 together with an index value, then reduces
 * the four lanes in scalar code and writes the chosen index to target[0].
 *
 * target     - target[0] receives the selected index.
 * src0       - complex input; read with _mm_load_ps, which requires a
 *              16-byte aligned address.
 * num_points - number of complex samples, each treated as 8 bytes.
 *
 * NOTE(review): several statements used below (declarations of i, sq_dist,
 * xmm4, xmm5, holderf, holderi, and any advancement of src0) are not visible
 * in this excerpt, so they are not described here.
 */
36 static inline void volk_32fc_index_max_16u_a_sse3(
unsigned int* target,
lv_32fc_t* src0,
unsigned int num_points) {
/* Input size in bytes.
 * NOTE(review): this unsigned multiply wraps for very large num_points -
 * confirm that callers bound it. */
38 const unsigned int num_bytes = num_points*8;
48 __m128 xmm1, xmm2, xmm3;
49 __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
/* Zero the scratch registers and the holders used for the final reduction. */
51 xmm5.int_vec = xmmfive = _mm_setzero_si128();
52 xmm4.int_vec = xmmfour = _mm_setzero_si128();
53 holderf.int_vec = holder0 = _mm_setzero_si128();
54 holderi.int_vec = holder1 = _mm_setzero_si128();
/* bound:      number of full 32-byte groups (four samples each).
 * leftovers0: 1 if a 16-byte remainder (two samples) is present.
 * leftovers1: 1 if an 8-byte remainder (one sample) is present. */
57 int bound = num_bytes >> 5;
58 int leftovers0 = (num_bytes >> 4) & 1;
59 int leftovers1 = (num_bytes >> 3) & 1;
/* xmm8:  candidate index for each lane in the current pass.
 * xmm9:  index recorded for each lane's running maximum.
 * xmm10: amount added to the candidate indices after each pass.
 * xmm3:  running per-lane maximum, starting at zero. */
63 xmm8 = _mm_set_epi32(3, 2, 1, 0);
/* NOTE(review): the chained assignment below zeroes xmm8 as well as xmm9,
 * discarding the (3, 2, 1, 0) value set on the previous line. If distinct
 * per-lane starting indices were intended, this looks like a bug - confirm. */
64 xmm9 = xmm8 = _mm_setzero_si128();
65 xmm10 = _mm_set_epi32(4, 4, 4, 4);
66 xmm3 = _mm_setzero_ps();
/* Main loop: one pass per group of four complex samples. */
71 for(; i < bound; ++
i) {
/* Load eight floats: samples 0-1 into xmm1, samples 2-3 into xmm2. */
73 xmm1 = _mm_load_ps((
float*)src0);
74 xmm2 = _mm_load_ps((
float*)&src0[2]);
/* Square every float. */
80 xmm1 = _mm_mul_ps(xmm1, xmm1);
81 xmm2 = _mm_mul_ps(xmm2, xmm2);
/* Add adjacent pairs so each lane of xmm1 holds one sample's re^2 + im^2. */
84 xmm1 = _mm_hadd_ps(xmm1, xmm2);
/* Update the running per-lane maximum. */
86 xmm3 = _mm_max_ps(xmm1, xmm3);
/* xmm4 mask: lanes where the new value is below the (updated) maximum.
 * xmm5 mask: lanes where the new value equals it, i.e. the new value is now
 * the maximum; this includes ties with the previous maximum. */
88 xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
89 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
/* Take the candidate index where the new value won, keep the recorded index
 * elsewhere; the two masked values are then combined by addition. */
93 xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
94 xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
96 xmm9 = _mm_add_epi32(xmm11, xmm12);
/* Advance the candidate indices by xmm10. */
98 xmm8 = _mm_add_epi32(xmm8, xmm10);
/* Remainder of two samples (runs at most once, since leftovers0 is 0 or 1). */
107 for(i = 0; i < leftovers0; ++
i) {
110 xmm2 = _mm_load_ps((
float*)src0);
/* Duplicate the low two lanes of xmm8 (viewed as floats) into both halves of
 * xmm1. NOTE(review): the consumer of this value is not visible in this
 * excerpt - confirm against the full file. */
112 xmm1 = _mm_movelh_ps(
bit128_p(&xmm8)->float_vec,
bit128_p(&xmm8)->float_vec);
115 xmm2 = _mm_mul_ps(xmm2, xmm2);
/* Both hadd operands are xmm2, so the two squared magnitudes appear in the
 * low half of xmm1 and are repeated in the high half. */
119 xmm1 = _mm_hadd_ps(xmm2, xmm2);
121 xmm3 = _mm_max_ps(xmm1, xmm3);
/* The index stride becomes 2 for this pass. */
123 xmm10 = _mm_set_epi32(2, 2, 2, 2);
/* Same mask-and-select update as in the main loop. */
126 xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
127 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
131 xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
132 xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
134 xmm9 = _mm_add_epi32(xmm11, xmm12);
136 xmm8 = _mm_add_epi32(xmm8, xmm10);
/* Remainder of one sample (runs at most once, since leftovers1 is 0 or 1). */
144 for(i = 0; i < leftovers1; ++
i) {
/* Broadcast sq_dist to all lanes; how sq_dist is computed is not visible in
 * this excerpt. */
150 xmm2 = _mm_load1_ps(&sq_dist);
/* _mm_max_ss updates only lane 0 of xmm3; the upper lanes are kept. */
154 xmm3 = _mm_max_ss(xmm3, xmm2);
/* NOTE(review): xmm1 is compared here, but no assignment to it is visible
 * inside this loop - confirm what it holds at this point. */
158 xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
159 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
/* Broadcast lane 0 of xmm8 to all lanes. */
162 xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
/* NOTE(review): the mask pairing is the reverse of the two loops above: the
 * candidate index goes with the less-than mask and the recorded index with
 * the equal mask. Presumably xmm1 holds the previous maxima here - confirm. */
164 xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
165 xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
168 xmm9 = _mm_add_epi32(xmm11, xmm12);
/* Spill the per-lane maxima and their recorded indices for the scalar
 * reduction. */
176 _mm_store_ps((
float*)&(holderf.f), xmm3);
177 _mm_store_si128(&(holderi.int_vec), xmm9);
/* Reduce across the four lanes: start from lane 0 and switch to a later lane
 * only when its value is strictly greater, so the lowest lane wins ties. */
179 target[0] = holderi.i[0];
180 sq_dist = holderf.f[0];
181 target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
182 sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
183 target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
184 sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
185 target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
186 sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
213 #ifdef LV_HAVE_GENERIC
/*
 * Portable scalar kernel: walks the samples one at a time and remembers the
 * loop index at which sq_dist was largest.
 *
 * target     - output pointer; the store to it is not visible in this excerpt.
 * src0       - complex input samples.
 * num_points - number of complex samples, each treated as 8 bytes.
 *
 * NOTE(review): the declarations of i, max and sq_dist and the computation of
 * sq_dist are not visible in this excerpt, so they are not described here.
 */
214 static inline void volk_32fc_index_max_16u_generic(
unsigned int* target,
lv_32fc_t* src0,
unsigned int num_points) {
/* Input size in bytes.
 * NOTE(review): this unsigned multiply wraps for very large num_points -
 * confirm that callers bound it. */
216 const unsigned int num_bytes = num_points*8;
/* Index of the best sample so far; stays 0 if no sample ever exceeds max. */
220 unsigned int index = 0;
/* num_bytes >> 3 converts the byte count back to a sample count. */
224 for(; i < num_bytes >> 3; ++
i) {
/* The comparison is strict, so on equal values the earlier index is kept.
 * index must be updated before max, because both lines test against the
 * old max. */
228 index = sq_dist > max ? i : index;
229 max = sq_dist > max ? sq_dist : max;
#define bit128_p(x)
Definition: volk_common.h:94
float complex lv_32fc_t
Definition: volk_complex.h:56
#define lv_creal(x)
Definition: volk_complex.h:76
Definition: volk_common.h:78
#define lv_cimag(x)
Definition: volk_complex.h:78
uint32_t i[4]
Definition: volk_common.h:80