GNU Radio Manual and C++ API Reference  3.7.6.1
The Free & Open Software Radio Ecosystem
volk_32f_x3_sum_of_poly_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
24 #define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
25 
26 #include<inttypes.h>
27 #include<stdio.h>
28 #include<volk/volk_complex.h>
29 
30 #ifndef MAX
31 #define MAX(X,Y) ((X) > (Y)?(X):(Y))
32 #endif
33 
34 #ifdef LV_HAVE_SSE3
35 #include<xmmintrin.h>
36 #include<pmmintrin.h>
37 
38 static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) {
39 
40  const unsigned int num_bytes = num_points*4;
41 
42  float result = 0.0;
43  float fst = 0.0;
44  float sq = 0.0;
45  float thrd = 0.0;
46  float frth = 0.0;
47  //float fith = 0.0;
48 
49 
50 
51  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12;
52 
53  xmm9 = _mm_setzero_ps();
54  xmm1 = _mm_setzero_ps();
55 
56  xmm0 = _mm_load1_ps(&center_point_array[0]);
57  xmm6 = _mm_load1_ps(&center_point_array[1]);
58  xmm7 = _mm_load1_ps(&center_point_array[2]);
59  xmm8 = _mm_load1_ps(&center_point_array[3]);
60  //xmm11 = _mm_load1_ps(&center_point_array[4]);
61  xmm10 = _mm_load1_ps(cutoff);
62 
63  int bound = num_bytes >> 4;
64  int leftovers = (num_bytes >> 2) & 3;
65  int i = 0;
66 
67  for(; i < bound; ++i) {
68  xmm2 = _mm_load_ps(src0);
69  xmm2 = _mm_max_ps(xmm10, xmm2);
70  xmm3 = _mm_mul_ps(xmm2, xmm2);
71  xmm4 = _mm_mul_ps(xmm2, xmm3);
72  xmm5 = _mm_mul_ps(xmm3, xmm3);
73  //xmm12 = _mm_mul_ps(xmm3, xmm4);
74 
75  xmm2 = _mm_mul_ps(xmm2, xmm0);
76  xmm3 = _mm_mul_ps(xmm3, xmm6);
77  xmm4 = _mm_mul_ps(xmm4, xmm7);
78  xmm5 = _mm_mul_ps(xmm5, xmm8);
79  //xmm12 = _mm_mul_ps(xmm12, xmm11);
80 
81  xmm2 = _mm_add_ps(xmm2, xmm3);
82  xmm3 = _mm_add_ps(xmm4, xmm5);
83 
84  src0 += 4;
85 
86  xmm9 = _mm_add_ps(xmm2, xmm9);
87 
88  xmm1 = _mm_add_ps(xmm3, xmm1);
89 
90  //xmm9 = _mm_add_ps(xmm12, xmm9);
91  }
92 
93  xmm2 = _mm_hadd_ps(xmm9, xmm1);
94  xmm3 = _mm_hadd_ps(xmm2, xmm2);
95  xmm4 = _mm_hadd_ps(xmm3, xmm3);
96 
97  _mm_store_ss(&result, xmm4);
98 
99 
100 
101  for(i = 0; i < leftovers; ++i) {
102  fst = src0[i];
103  fst = MAX(fst, *cutoff);
104  sq = fst * fst;
105  thrd = fst * sq;
106  frth = sq * sq;
107  //fith = sq * thrd;
108 
109  result += (center_point_array[0] * fst +
110  center_point_array[1] * sq +
111  center_point_array[2] * thrd +
112  center_point_array[3] * frth);// +
113  //center_point_array[4] * fith);
114  }
115 
116  result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5];
117 
118  target[0] = result;
119 }
120 
121 
122 #endif /*LV_HAVE_SSE3*/
123 
124 
125 #ifdef LV_HAVE_AVX
126 #include<immintrin.h>
127 
128 static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points)
129 {
130  const unsigned int eighth_points = num_points / 8;
131  float fst = 0.0;
132  float sq = 0.0;
133  float thrd = 0.0;
134  float frth = 0.0;
135 
136  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
137  __m256 target_vec;
138  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
139 
140  cpa0 = _mm256_set1_ps(center_point_array[0]);
141  cpa1 = _mm256_set1_ps(center_point_array[1]);
142  cpa2 = _mm256_set1_ps(center_point_array[2]);
143  cpa3 = _mm256_set1_ps(center_point_array[3]);
144  cutoff_vec = _mm256_set1_ps(*cutoff);
145  target_vec = _mm256_setzero_ps();
146 
147  unsigned int i;
148 
149  for(i = 0; i < eighth_points; ++i) {
150  x_to_1 = _mm256_load_ps(src0);
151  x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
152  x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
153  x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
154  // x^1 * x^3 is slightly faster than x^2 * x^2
155  x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
156 
157  x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
158  x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
159  x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
160  x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
161 
162  x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
163  x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
164  // this is slightly faster than result += (x_to_1 + x_to_3)
165  target_vec = _mm256_add_ps(x_to_1, target_vec);
166  target_vec = _mm256_add_ps(x_to_3, target_vec);
167 
168  src0 += 8;
169  }
170 
171  // the hadd for vector reduction has very very slight impact @ 50k iters
172  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
173  target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
174  _mm256_store_ps(temp_results, target_vec);
175  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
176 
177 
178  for(i = eighth_points*8; i < num_points; ++i) {
179  fst = *(src0++);
180  fst = MAX(fst, *cutoff);
181  sq = fst * fst;
182  thrd = fst * sq;
183  frth = sq * sq;
184 
185  *target += (center_point_array[0] * fst +
186  center_point_array[1] * sq +
187  center_point_array[2] * thrd +
188  center_point_array[3] * frth);
189  }
190 
191  *target += ((float)(num_points)) * center_point_array[4];
192 
193 }
194 #endif // LV_HAVE_AVX
195 
196 
197 #ifdef LV_HAVE_GENERIC
198 
199 static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) {
200 
201  const unsigned int num_bytes = num_points*4;
202 
203  float result = 0.0;
204  float fst = 0.0;
205  float sq = 0.0;
206  float thrd = 0.0;
207  float frth = 0.0;
208  //float fith = 0.0;
209 
210 
211 
212  unsigned int i = 0;
213 
214  for(; i < num_bytes >> 2; ++i) {
215  fst = src0[i];
216  fst = MAX(fst, *cutoff);
217 
218  sq = fst * fst;
219  thrd = fst * sq;
220  frth = sq * sq;
221  //fith = sq * thrd;
222 
223  result += (center_point_array[0] * fst +
224  center_point_array[1] * sq +
225  center_point_array[2] * thrd +
226  center_point_array[3] * frth); //+
227  //center_point_array[4] * fith);
228  /*printf("%f12...%d\n", (center_point_array[0] * fst +
229  center_point_array[1] * sq +
230  center_point_array[2] * thrd +
231  center_point_array[3] * frth) +
232  //center_point_array[4] * fith) +
233  (center_point_array[4]), i);
234  */
235  }
236 
237  result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]);
238 
239 
240 
241  *target = result;
242 }
243 
244 #endif /*LV_HAVE_GENERIC*/
245 
246 
247 #ifdef LV_HAVE_AVX
248 #include<immintrin.h>
249 
250 static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points)
251 {
252  const unsigned int eighth_points = num_points / 8;
253  float fst = 0.0;
254  float sq = 0.0;
255  float thrd = 0.0;
256  float frth = 0.0;
257 
258  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
259  __m256 target_vec;
260  __m256 x_to_1, x_to_2, x_to_3, x_to_4;
261 
262  cpa0 = _mm256_set1_ps(center_point_array[0]);
263  cpa1 = _mm256_set1_ps(center_point_array[1]);
264  cpa2 = _mm256_set1_ps(center_point_array[2]);
265  cpa3 = _mm256_set1_ps(center_point_array[3]);
266  cutoff_vec = _mm256_set1_ps(*cutoff);
267  target_vec = _mm256_setzero_ps();
268 
269  unsigned int i;
270 
271  for(i = 0; i < eighth_points; ++i) {
272  x_to_1 = _mm256_loadu_ps(src0);
273  x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
274  x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
275  x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
276  // x^1 * x^3 is slightly faster than x^2 * x^2
277  x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
278 
279  x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
280  x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
281  x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
282  x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
283 
284  x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
285  x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
286  // this is slightly faster than result += (x_to_1 + x_to_3)
287  target_vec = _mm256_add_ps(x_to_1, target_vec);
288  target_vec = _mm256_add_ps(x_to_3, target_vec);
289 
290  src0 += 8;
291  }
292 
293  // the hadd for vector reduction has very very slight impact @ 50k iters
294  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
295  target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
296  _mm256_store_ps(temp_results, target_vec);
297  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
298 
299 
300  for(i = eighth_points*8; i < num_points; ++i) {
301  fst = *(src0++);
302  fst = MAX(fst, *cutoff);
303  sq = fst * fst;
304  thrd = fst * sq;
305  frth = sq * sq;
306 
307  *target += (center_point_array[0] * fst +
308  center_point_array[1] * sq +
309  center_point_array[2] * thrd +
310  center_point_array[3] * frth);
311  }
312 
313  *target += ((float)(num_points)) * center_point_array[4];
314 
315 }
316 #endif // LV_HAVE_AVX
317 
318 #ifdef LV_HAVE_NEON
319 #include <arm_neon.h>
320 
321 static inline void volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target, float* __restrict src0, float* __restrict center_point_array, float* __restrict cutoff, unsigned int num_points) {
322 
323 
324  unsigned int i;
325  float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
326 
327  float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
328  float32x2_t cutoff_vector;
329  float32x2x2_t x_low, x_high;
330  float32x4_t x_qvector, c_qvector, cpa_qvector;
331  float accumulator;
332  float res_accumulators[4];
333 
334  c_qvector = vld1q_f32( zero );
335  // load the cutoff in to a vector
336  cutoff_vector = vdup_n_f32( *cutoff );
337  // ... center point array
338  cpa_qvector = vld1q_f32( center_point_array );
339 
340  for(i=0; i < num_points; ++i) {
341  // load x (src0)
342  x_to_1 = vdup_n_f32( *src0++ );
343 
344  // Get a vector of max(src0, cutoff)
345  x_to_1 = vmax_f32(x_to_1, cutoff_vector ); // x^1
346  x_to_2 = vmul_f32(x_to_1, x_to_1); // x^2
347  x_to_3 = vmul_f32(x_to_2, x_to_1); // x^3
348  x_to_4 = vmul_f32(x_to_3, x_to_1); // x^4
349  // zip up doubles to interleave
350  x_low = vzip_f32(x_to_1, x_to_2); // [x^2 | x^1 || x^2 | x^1]
351  x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3]
352  // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
353  x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
354  // now we finally have [x^4 | x^3 | x^2 | x] !
355 
356  c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
357 
358  }
359  // there should be better vector reduction techniques
360  vst1q_f32(res_accumulators, c_qvector );
361  accumulator = res_accumulators[0] + res_accumulators[1] +
362  res_accumulators[2] + res_accumulators[3];
363 
364  *target = accumulator + center_point_array[4] * (float)num_points;
365 }
366 
367 #endif /* LV_HAVE_NEON */
368 
369 #ifdef LV_HAVE_NEON
370 
371 static inline void volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target, float* __restrict src0, float* __restrict center_point_array, float* __restrict cutoff, unsigned int num_points) {
372 
373 
374  unsigned int i;
375  float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
376 
377  float accumulator;
378 
379 
380  float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
381  accumulator1_vec = vld1q_f32(zero);
382  accumulator2_vec = vld1q_f32(zero);
383  accumulator3_vec = vld1q_f32(zero);
384  accumulator4_vec = vld1q_f32(zero);
385  float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
386  float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;
387 
388  // load the cutoff in to a vector
389  cutoff_vector = vdupq_n_f32( *cutoff );
390  // ... center point array
391  cpa_0 = vdupq_n_f32(center_point_array[0]);
392  cpa_1 = vdupq_n_f32(center_point_array[1]);
393  cpa_2 = vdupq_n_f32(center_point_array[2]);
394  cpa_3 = vdupq_n_f32(center_point_array[3]);
395 
396 
397  // nathan is not sure why this is slower *and* wrong compared to neonvertfma
398  for(i=0; i < num_points/4; ++i) {
399  // load x
400  x_to_1 = vld1q_f32( src0 );
401 
402  // Get a vector of max(src0, cutoff)
403  x_to_1 = vmaxq_f32(x_to_1, cutoff_vector ); // x^1
404  x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2
405  x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3
406  x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4
407  x_to_1 = vmulq_f32(x_to_1, cpa_0);
408  x_to_2 = vmulq_f32(x_to_2, cpa_1);
409  x_to_3 = vmulq_f32(x_to_3, cpa_2);
410  x_to_4 = vmulq_f32(x_to_4, cpa_3);
411  accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
412  accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
413  accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
414  accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);
415 
416  src0 += 4;
417  }
418  accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
419  accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
420  accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);
421 
422  __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
423  vst1q_f32(res_accumulators, accumulator1_vec );
424  accumulator = res_accumulators[0] + res_accumulators[1] +
425  res_accumulators[2] + res_accumulators[3];
426 
427  float fst = 0.0;
428  float sq = 0.0;
429  float thrd = 0.0;
430  float frth = 0.0;
431 
432  for(i = 4*num_points/4; i < num_points; ++i) {
433  fst = src0[i];
434  fst = MAX(fst, *cutoff);
435 
436  sq = fst * fst;
437  thrd = fst * sq;
438  frth = sq * sq;
439  //fith = sq * thrd;
440 
441  accumulator += (center_point_array[0] * fst +
442  center_point_array[1] * sq +
443  center_point_array[2] * thrd +
444  center_point_array[3] * frth); //+
445  }
446 
447  *target = accumulator + center_point_array[4] * (float)num_points;
448 }
449 
450 #endif /* LV_HAVE_NEON */
451 
452 #endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/
#define MAX(X, Y)
Definition: volk_32f_x3_sum_of_poly_32f.h:31
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27