23 #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
24 #define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
/* Larger of X and Y. Each argument may be evaluated twice, so only pass
 * expressions without side effects. */
#define MAX(X,Y) ((X) > (Y)?(X):(Y))
38 static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(
float* target,
float* src0,
float* center_point_array,
float* cutoff,
unsigned int num_points) {
40 const unsigned int num_bytes = num_points*4;
51 __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
53 xmm9 = _mm_setzero_ps();
54 xmm1 = _mm_setzero_ps();
56 xmm0 = _mm_load1_ps(¢er_point_array[0]);
57 xmm6 = _mm_load1_ps(¢er_point_array[1]);
58 xmm7 = _mm_load1_ps(¢er_point_array[2]);
59 xmm8 = _mm_load1_ps(¢er_point_array[3]);
61 xmm10 = _mm_load1_ps(cutoff);
63 int bound = num_bytes >> 4;
64 int leftovers = (num_bytes >> 2) & 3;
67 for(; i < bound; ++i) {
68 xmm2 = _mm_load_ps(src0);
69 xmm2 = _mm_max_ps(xmm10, xmm2);
70 xmm3 = _mm_mul_ps(xmm2, xmm2);
71 xmm4 = _mm_mul_ps(xmm2, xmm3);
72 xmm5 = _mm_mul_ps(xmm3, xmm3);
75 xmm2 = _mm_mul_ps(xmm2, xmm0);
76 xmm3 = _mm_mul_ps(xmm3, xmm6);
77 xmm4 = _mm_mul_ps(xmm4, xmm7);
78 xmm5 = _mm_mul_ps(xmm5, xmm8);
81 xmm2 = _mm_add_ps(xmm2, xmm3);
82 xmm3 = _mm_add_ps(xmm4, xmm5);
86 xmm9 = _mm_add_ps(xmm2, xmm9);
88 xmm1 = _mm_add_ps(xmm3, xmm1);
93 xmm2 = _mm_hadd_ps(xmm9, xmm1);
94 xmm3 = _mm_hadd_ps(xmm2, xmm2);
95 xmm4 = _mm_hadd_ps(xmm3, xmm3);
97 _mm_store_ss(&result, xmm4);
101 for(i = 0; i < leftovers; ++i) {
103 fst =
MAX(fst, *cutoff);
109 result += (center_point_array[0] * fst +
110 center_point_array[1] * sq +
111 center_point_array[2] * thrd +
112 center_point_array[3] * frth);
116 result += ((float)((bound * 4) + leftovers)) * center_point_array[4];
126 #include<immintrin.h>
128 static inline void volk_32f_x3_sum_of_poly_32f_a_avx(
float* target,
float* src0,
float* center_point_array,
float* cutoff,
unsigned int num_points)
130 const unsigned int eighth_points = num_points / 8;
136 __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
138 __m256 x_to_1, x_to_2, x_to_3, x_to_4;
140 cpa0 = _mm256_set1_ps(center_point_array[0]);
141 cpa1 = _mm256_set1_ps(center_point_array[1]);
142 cpa2 = _mm256_set1_ps(center_point_array[2]);
143 cpa3 = _mm256_set1_ps(center_point_array[3]);
144 cutoff_vec = _mm256_set1_ps(*cutoff);
145 target_vec = _mm256_setzero_ps();
149 for(i = 0; i < eighth_points; ++i) {
150 x_to_1 = _mm256_load_ps(src0);
151 x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
152 x_to_2 = _mm256_mul_ps(x_to_1, x_to_1);
153 x_to_3 = _mm256_mul_ps(x_to_1, x_to_2);
155 x_to_4 = _mm256_mul_ps(x_to_1, x_to_3);
157 x_to_1 = _mm256_mul_ps(x_to_1, cpa0);
158 x_to_2 = _mm256_mul_ps(x_to_2, cpa1);
159 x_to_3 = _mm256_mul_ps(x_to_3, cpa2);
160 x_to_4 = _mm256_mul_ps(x_to_4, cpa3);
162 x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
163 x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
165 target_vec = _mm256_add_ps(x_to_1, target_vec);
166 target_vec = _mm256_add_ps(x_to_3, target_vec);
173 target_vec = _mm256_hadd_ps(target_vec, target_vec);
174 _mm256_store_ps(temp_results, target_vec);
175 *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
178 for(i = eighth_points*8; i < num_points; ++i) {
180 fst =
MAX(fst, *cutoff);
185 *target += (center_point_array[0] * fst +
186 center_point_array[1] * sq +
187 center_point_array[2] * thrd +
188 center_point_array[3] * frth);
191 *target += ((float)(num_points)) * center_point_array[4];
194 #endif // LV_HAVE_AVX
197 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Sum of a clamped fourth-order polynomial over src0 (portable C).
 *
 * Every sample is clamped from below, x = max(src0[k], *cutoff), and the
 * kernel accumulates
 *   c[0]*x + c[1]*x^2 + c[2]*x^3 + c[3]*x^4
 * over all samples, then adds num_points * c[4] (c = center_point_array).
 *
 * \param target              receives the scalar result in *target
 * \param src0                input samples
 * \param center_point_array  the five coefficients c[0..4]
 * \param cutoff              pointer to the lower clamp value
 * \param num_points          number of samples in src0
 */
static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target,
                                                       float* src0,
                                                       float* center_point_array,
                                                       float* cutoff,
                                                       unsigned int num_points)
{
  float result = 0.0f;
  unsigned int i;

  /* Iterate on the point count directly; going through a byte count
   * (num_points * 4) and shifting back wraps for very large inputs. */
  for (i = 0; i < num_points; ++i) {
    float fst = src0[i];
    float sq;
    float thrd;
    float frth;

    /* Clamp from below; same comparison as MAX(fst, *cutoff). */
    fst = (fst > *cutoff) ? fst : *cutoff;

    sq = fst * fst;
    thrd = fst * sq;
    frth = sq * sq;

    result += (center_point_array[0] * fst +
               center_point_array[1] * sq +
               center_point_array[2] * thrd +
               center_point_array[3] * frth);
  }

  /* The constant term contributes once per input sample. */
  result += ((float)num_points) * (center_point_array[4]);

  *target = result;
}
248 #include<immintrin.h>
250 static inline void volk_32f_x3_sum_of_poly_32f_u_avx(
float* target,
float* src0,
float* center_point_array,
float* cutoff,
unsigned int num_points)
252 const unsigned int eighth_points = num_points / 8;
258 __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
260 __m256 x_to_1, x_to_2, x_to_3, x_to_4;
262 cpa0 = _mm256_set1_ps(center_point_array[0]);
263 cpa1 = _mm256_set1_ps(center_point_array[1]);
264 cpa2 = _mm256_set1_ps(center_point_array[2]);
265 cpa3 = _mm256_set1_ps(center_point_array[3]);
266 cutoff_vec = _mm256_set1_ps(*cutoff);
267 target_vec = _mm256_setzero_ps();
271 for(i = 0; i < eighth_points; ++i) {
272 x_to_1 = _mm256_loadu_ps(src0);
273 x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
274 x_to_2 = _mm256_mul_ps(x_to_1, x_to_1);
275 x_to_3 = _mm256_mul_ps(x_to_1, x_to_2);
277 x_to_4 = _mm256_mul_ps(x_to_1, x_to_3);
279 x_to_1 = _mm256_mul_ps(x_to_1, cpa0);
280 x_to_2 = _mm256_mul_ps(x_to_2, cpa1);
281 x_to_3 = _mm256_mul_ps(x_to_3, cpa2);
282 x_to_4 = _mm256_mul_ps(x_to_4, cpa3);
284 x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
285 x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
287 target_vec = _mm256_add_ps(x_to_1, target_vec);
288 target_vec = _mm256_add_ps(x_to_3, target_vec);
295 target_vec = _mm256_hadd_ps(target_vec, target_vec);
296 _mm256_store_ps(temp_results, target_vec);
297 *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
300 for(i = eighth_points*8; i < num_points; ++i) {
302 fst =
MAX(fst, *cutoff);
307 *target += (center_point_array[0] * fst +
308 center_point_array[1] * sq +
309 center_point_array[2] * thrd +
310 center_point_array[3] * frth);
313 *target += ((float)(num_points)) * center_point_array[4];
316 #endif // LV_HAVE_AVX
319 #include <arm_neon.h>
321 static inline void volk_32f_x3_sum_of_poly_32f_a_neon(
float* __restrict target,
float* __restrict src0,
float* __restrict center_point_array,
float* __restrict cutoff,
unsigned int num_points) {
325 float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
327 float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
328 float32x2_t cutoff_vector;
329 float32x2x2_t x_low, x_high;
330 float32x4_t x_qvector, c_qvector, cpa_qvector;
332 float res_accumulators[4];
334 c_qvector = vld1q_f32( zero );
336 cutoff_vector = vdup_n_f32( *cutoff );
338 cpa_qvector = vld1q_f32( center_point_array );
340 for(i=0; i < num_points; ++i) {
342 x_to_1 = vdup_n_f32( *src0++ );
345 x_to_1 = vmax_f32(x_to_1, cutoff_vector );
346 x_to_2 = vmul_f32(x_to_1, x_to_1);
347 x_to_3 = vmul_f32(x_to_2, x_to_1);
348 x_to_4 = vmul_f32(x_to_3, x_to_1);
350 x_low = vzip_f32(x_to_1, x_to_2);
351 x_high = vzip_f32(x_to_3, x_to_4);
353 x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
356 c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
360 vst1q_f32(res_accumulators, c_qvector );
361 accumulator = res_accumulators[0] + res_accumulators[1] +
362 res_accumulators[2] + res_accumulators[3];
364 *target = accumulator + center_point_array[4] * (float)num_points;
371 static inline void volk_32f_x3_sum_of_poly_32f_neonvert(
float* __restrict target,
float* __restrict src0,
float* __restrict center_point_array,
float* __restrict cutoff,
unsigned int num_points) {
375 float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
380 float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
381 accumulator1_vec = vld1q_f32(zero);
382 accumulator2_vec = vld1q_f32(zero);
383 accumulator3_vec = vld1q_f32(zero);
384 accumulator4_vec = vld1q_f32(zero);
385 float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
386 float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;
389 cutoff_vector = vdupq_n_f32( *cutoff );
391 cpa_0 = vdupq_n_f32(center_point_array[0]);
392 cpa_1 = vdupq_n_f32(center_point_array[1]);
393 cpa_2 = vdupq_n_f32(center_point_array[2]);
394 cpa_3 = vdupq_n_f32(center_point_array[3]);
398 for(i=0; i < num_points/4; ++i) {
400 x_to_1 = vld1q_f32( src0 );
403 x_to_1 = vmaxq_f32(x_to_1, cutoff_vector );
404 x_to_2 = vmulq_f32(x_to_1, x_to_1);
405 x_to_3 = vmulq_f32(x_to_2, x_to_1);
406 x_to_4 = vmulq_f32(x_to_3, x_to_1);
407 x_to_1 = vmulq_f32(x_to_1, cpa_0);
408 x_to_2 = vmulq_f32(x_to_2, cpa_1);
409 x_to_3 = vmulq_f32(x_to_3, cpa_2);
410 x_to_4 = vmulq_f32(x_to_4, cpa_3);
411 accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
412 accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
413 accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
414 accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);
418 accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
419 accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
420 accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);
423 vst1q_f32(res_accumulators, accumulator1_vec );
424 accumulator = res_accumulators[0] + res_accumulators[1] +
425 res_accumulators[2] + res_accumulators[3];
432 for(i = 4*num_points/4; i < num_points; ++i) {
434 fst =
MAX(fst, *cutoff);
441 accumulator += (center_point_array[0] * fst +
442 center_point_array[1] * sq +
443 center_point_array[2] * thrd +
444 center_point_array[3] * frth);
447 *target = accumulator + center_point_array[4] * (float)num_points;
/*
 * Cross-references:
 *   MAX(X, Y)              - defined in volk_32f_x3_sum_of_poly_32f.h, line 31
 *   __VOLK_ATTR_ALIGNED(x) - defined in volk_common.h, line 27
 */