23 #ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
24 #define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
/*
 * Divisor applied to the vector-iteration count in the outer loops of the
 * SIMD kernels in this file (e.g. halfPoints / ROTATOR_RELOAD).
 * NOTE(review): presumably the number of inner iterations executed between
 * two re-normalizations of the running phase; the inner loop headers are not
 * visible in this listing, so confirm against the full source.
 */
30 #define ROTATOR_RELOAD 512
33 #ifdef LV_HAVE_GENERIC
/*
 * Portable rotator: multiplies each input sample by the current phase and
 * then advances the phase by phase_inc.
 *
 * outVector  - destination; written through and advanced one sample at a time.
 * inVector   - source; read through and advanced one sample at a time.
 * phase_inc  - complex factor multiplied into *phase after every sample.
 * phase      - in/out: current phase, updated in place.
 * num_points - NOTE(review): not referenced by any line visible in this
 *              listing; presumably the number of samples to process - confirm.
 *
 * NOTE(review): the loop headers, closing braces and preprocessor
 * conditionals of this function are not visible in this listing; the
 * comments below describe only the statements that are shown.
 */
43 static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(
lv_32fc_t* outVector,
const lv_32fc_t* inVector,
const lv_32fc_t phase_inc,
lv_32fc_t* phase,
unsigned int num_points){
/* Rotate one sample by the current phase, then step the phase. */
48 *outVector++ = *inVector++ * (*phase);
49 (*phase) *= phase_inc;
/* Rescale the phase to unit magnitude (C++ spelling of the magnitude call).
 * NOTE(review): presumably guarded by a language conditional that is not
 * visible here, and presumably done to stop magnitude drift - confirm. */
52 (*phase) /= std::abs((*phase));
/* Same rescaling, using the C99 cabsf magnitude function. */
54 (*phase) /= cabsf((*phase));
/* Same rotate-and-step pair as above.
 * NOTE(review): presumably the body of a remainder loop whose header is not
 * visible in this listing - confirm. */
58 *outVector++ = *inVector++ * (*phase);
59 (*phase) *= phase_inc;
68 #include <smmintrin.h>
/*
 * SSE4.1 rotator using aligned data loads/stores (_mm_load_ps/_mm_store_ps).
 * Each __m128 holds four floats, i.e. two complex values, so two samples are
 * rotated per vector step; the leftover num_points % 2 sample is handled by a
 * scalar loop at the end.
 *
 * outVector  - NOTE(review): not referenced in the visible lines; presumably
 *              the initial value of cPtr (declaration not shown) - confirm.
 * inVector   - NOTE(review): likewise presumably the initial value of aPtr.
 * phase_inc  - complex per-sample phase step (used directly in the tail loop).
 * phase      - in/out: starting phase on entry, receives phase_Ptr[0] on exit.
 * num_points - sample count; split into num_points / 2 vector steps and
 *              num_points % 2 scalar steps.
 *
 * NOTE(review): this listing omits lines (loop headers, closing braces, the
 * declarations of aPtr/cPtr and the set-up of inc_Val); comments describe
 * only what is shown.
 */
70 static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(
lv_32fc_t* outVector,
const lv_32fc_t* inVector,
const lv_32fc_t phase_inc,
lv_32fc_t* phase,
unsigned int num_points){
/* Two working phases, one per complex lane, both starting at *phase. */
74 lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};
76 unsigned int i, j = 0;
/* Loop over the two lanes of phase_Ptr.
 * NOTE(review): the loop body is not visible in this listing; presumably it
 * offsets each lane's starting phase - confirm. */
78 for(i = 0; i < 2; ++
i) {
86 __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
/* Load both working phases into one vector (unaligned load of the local array). */
88 phase_Val = _mm_loadu_ps((
float*)phase_Ptr);
/* Number of two-sample vector steps. */
91 const unsigned int halfPoints = num_points / 2;
/* Aligned load of two complex input samples. */
97 aVal = _mm_load_ps((
float*)aPtr);
/* Duplicate the even-indexed floats (real parts) and the odd-indexed floats
 * (imaginary parts) of the phase and of the increment vector.
 * NOTE(review): the assignment of inc_Val is not visible in this listing. */
99 yl = _mm_moveldup_ps(phase_Val);
100 yh = _mm_movehdup_ps(phase_Val);
101 ylp = _mm_moveldup_ps(inc_Val);
102 yhp = _mm_movehdup_ps(inc_Val);
/* First partial products: operand * real part of the multiplier. */
104 tmp1 = _mm_mul_ps(aVal, yl);
105 tmp1p = _mm_mul_ps(phase_Val, ylp);
/* 0xB1 swaps the real and imaginary float inside every complex value. */
107 aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
108 phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
/* Second partial products: swapped operand * imaginary part of the multiplier. */
109 tmp2 = _mm_mul_ps(aVal, yh);
110 tmp2p = _mm_mul_ps(phase_Val, yhp);
/* addsub subtracts in even lanes and adds in odd lanes, completing the
 * complex products: z = input * phase, phase_Val = phase * increment. */
112 z = _mm_addsub_ps(tmp1, tmp2);
113 phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
/* Aligned store of the two rotated samples. */
115 _mm_store_ps((
float*)cPtr, z);
/* Rescale both phases to unit magnitude: square, sum re^2+im^2 per complex
 * value (hadd), spread each sum over both floats of its own lane (0xD8),
 * take the square root and divide. */
120 tmp1 = _mm_mul_ps(phase_Val, phase_Val);
121 tmp2 = _mm_hadd_ps(tmp1, tmp1);
122 tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
123 tmp2 = _mm_sqrt_ps(tmp1);
124 phase_Val = _mm_div_ps(phase_Val, tmp2);
/* Second copy of the load / complex-multiply / store sequence above.
 * NOTE(review): presumably the body of a remainder loop whose header is not
 * visible in this listing - confirm. */
127 aVal = _mm_load_ps((
float*)aPtr);
129 yl = _mm_moveldup_ps(phase_Val);
130 yh = _mm_movehdup_ps(phase_Val);
131 ylp = _mm_moveldup_ps(inc_Val);
132 yhp = _mm_movehdup_ps(inc_Val);
134 tmp1 = _mm_mul_ps(aVal, yl);
136 tmp1p = _mm_mul_ps(phase_Val, ylp);
138 aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
139 phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
140 tmp2 = _mm_mul_ps(aVal, yh);
141 tmp2p = _mm_mul_ps(phase_Val, yhp);
143 z = _mm_addsub_ps(tmp1, tmp2);
144 phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
146 _mm_store_ps((
float*)cPtr, z);
/* Spill the vector phases back to the scalar array for the tail loop. */
152 _mm_storeu_ps((
float*)phase_Ptr, phase_Val);
/* Scalar tail: at most one leftover sample, rotated with lane 0's phase. */
153 for(i = 0; i < num_points%2; ++
i) {
154 *cPtr++ = *aPtr++ * phase_Ptr[0];
155 phase_Ptr[0] *= (phase_inc);
/* Hand the final phase back to the caller. */
158 (*phase) = phase_Ptr[0];
165 #ifdef LV_HAVE_SSE4_1
166 #include <smmintrin.h>
/*
 * SSE4.1 rotator using unaligned data loads/stores (_mm_loadu_ps/_mm_storeu_ps).
 * Each __m128 holds four floats, i.e. two complex values, so two samples are
 * rotated per vector step; the leftover num_points % 2 sample is handled by a
 * scalar loop at the end.
 *
 * outVector  - NOTE(review): not referenced in the visible lines; presumably
 *              the initial value of cPtr (declaration not shown) - confirm.
 * inVector   - NOTE(review): likewise presumably the initial value of aPtr.
 * phase_inc  - complex per-sample phase step (used directly in the tail loop).
 * phase      - in/out: starting phase on entry, receives phase_Ptr[0] on exit.
 * num_points - sample count; split into num_points / 2 vector steps and
 *              num_points % 2 scalar steps.
 *
 * NOTE(review): this listing omits lines (inner loop headers, closing braces,
 * the declarations of aPtr/cPtr/incr and the set-up of inc_Val); comments
 * describe only what is shown.
 */
176 static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(
lv_32fc_t* outVector,
const lv_32fc_t* inVector,
const lv_32fc_t phase_inc,
lv_32fc_t* phase,
unsigned int num_points){
/* Two working phases, one per complex lane, both starting at *phase. */
180 lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};
182 unsigned int i, j = 0;
/* Scale each lane's starting phase by incr.
 * NOTE(review): incr's declaration and any per-iteration update are not
 * visible here; presumably it grows by phase_inc each pass so that lane i
 * starts i steps ahead - confirm. */
184 for(i = 0; i < 2; ++
i) {
185 phase_Ptr[
i] *= incr;
192 __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
/* Load both working phases into one vector. */
194 phase_Val = _mm_loadu_ps((
float*)phase_Ptr);
/* Number of two-sample vector steps. */
197 const unsigned int halfPoints = num_points / 2;
/* Outer loop: runs halfPoints / ROTATOR_RELOAD times.
 * NOTE(review): the inner loop header is not visible in this listing. */
200 for(i = 0; i < (
unsigned int)(halfPoints/ROTATOR_RELOAD); i++) {
/* Unaligned load of two complex input samples. */
203 aVal = _mm_loadu_ps((
float*)aPtr);
/* Duplicate the even-indexed floats (real parts) and the odd-indexed floats
 * (imaginary parts) of the phase and of the increment vector.
 * NOTE(review): the assignment of inc_Val is not visible in this listing. */
205 yl = _mm_moveldup_ps(phase_Val);
206 yh = _mm_movehdup_ps(phase_Val);
207 ylp = _mm_moveldup_ps(inc_Val);
208 yhp = _mm_movehdup_ps(inc_Val);
/* First partial products: operand * real part of the multiplier. */
210 tmp1 = _mm_mul_ps(aVal, yl);
211 tmp1p = _mm_mul_ps(phase_Val, ylp);
/* 0xB1 swaps the real and imaginary float inside every complex value. */
213 aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
214 phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
/* Second partial products: swapped operand * imaginary part of the multiplier. */
215 tmp2 = _mm_mul_ps(aVal, yh);
216 tmp2p = _mm_mul_ps(phase_Val, yhp);
/* addsub subtracts in even lanes and adds in odd lanes, completing the
 * complex products: z = input * phase, phase_Val = phase * increment. */
218 z = _mm_addsub_ps(tmp1, tmp2);
219 phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
/* Unaligned store of the two rotated samples. */
221 _mm_storeu_ps((
float*)cPtr, z);
/* Rescale both phases to unit magnitude: square, sum re^2+im^2 per complex
 * value (hadd), spread each sum over both floats of its own lane (0xD8),
 * take the square root and divide. */
226 tmp1 = _mm_mul_ps(phase_Val, phase_Val);
227 tmp2 = _mm_hadd_ps(tmp1, tmp1);
228 tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
229 tmp2 = _mm_sqrt_ps(tmp1);
230 phase_Val = _mm_div_ps(phase_Val, tmp2);
/* Second copy of the load / complex-multiply / store sequence above.
 * NOTE(review): presumably the body of a remainder loop whose header is not
 * visible in this listing - confirm. */
233 aVal = _mm_loadu_ps((
float*)aPtr);
235 yl = _mm_moveldup_ps(phase_Val);
236 yh = _mm_movehdup_ps(phase_Val);
237 ylp = _mm_moveldup_ps(inc_Val);
238 yhp = _mm_movehdup_ps(inc_Val);
240 tmp1 = _mm_mul_ps(aVal, yl);
242 tmp1p = _mm_mul_ps(phase_Val, ylp);
244 aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
245 phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
246 tmp2 = _mm_mul_ps(aVal, yh);
247 tmp2p = _mm_mul_ps(phase_Val, yhp);
249 z = _mm_addsub_ps(tmp1, tmp2);
250 phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
252 _mm_storeu_ps((
float*)cPtr, z);
/* Spill the vector phases back to the scalar array for the tail loop. */
258 _mm_storeu_ps((
float*)phase_Ptr, phase_Val);
/* Scalar tail: at most one leftover sample, rotated with lane 0's phase. */
259 for(i = 0; i < num_points%2; ++
i) {
260 *cPtr++ = *aPtr++ * phase_Ptr[0];
261 phase_Ptr[0] *= (phase_inc);
/* Hand the final phase back to the caller. */
264 (*phase) = phase_Ptr[0];
272 #include <immintrin.h>
/*
 * AVX rotator using aligned data loads/stores (_mm256_load_ps/_mm256_store_ps).
 * Each __m256 holds eight floats, i.e. four complex values, so four samples
 * are rotated per vector step; the leftover num_points % 4 samples are
 * handled by a scalar loop at the end.
 *
 * outVector  - NOTE(review): not referenced in the visible lines; presumably
 *              the initial value of cPtr (declaration not shown) - confirm.
 * inVector   - NOTE(review): likewise presumably the initial value of aPtr.
 * phase_inc  - complex per-sample phase step (used directly in the tail loop).
 * phase      - in/out: starting phase on entry, receives phase_Ptr[0] on exit.
 * num_points - sample count; split into num_points / 4 vector steps and
 *              num_points % 4 scalar steps.
 *
 * NOTE(review): this listing omits lines (inner loop headers, closing braces,
 * the declarations of aPtr/cPtr/incr and the set-up of inc_Val); comments
 * describe only what is shown.
 */
282 static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(
lv_32fc_t* outVector,
const lv_32fc_t* inVector,
const lv_32fc_t phase_inc,
lv_32fc_t* phase,
unsigned int num_points){
/* Four working phases, one per complex lane, all starting at *phase. */
286 lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
288 unsigned int i, j = 0;
/* Scale each lane's starting phase by incr.
 * NOTE(review): incr's declaration and any per-iteration update are not
 * visible here; presumably it grows by phase_inc each pass so that lane i
 * starts i steps ahead - confirm. */
290 for(i = 0; i < 4; ++
i) {
291 phase_Ptr[
i] *= incr;
300 __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
/* Load the four working phases into one vector (unaligned load of the local array). */
302 phase_Val = _mm256_loadu_ps((
float*)phase_Ptr);
/* Number of four-sample vector steps. */
304 const unsigned int fourthPoints = num_points / 4;
/* Outer loop: runs fourthPoints / ROTATOR_RELOAD times.
 * NOTE(review): the inner loop header is not visible in this listing. */
307 for(i = 0; i < (
unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
/* Aligned load of four complex input samples. */
310 aVal = _mm256_load_ps((
float*)aPtr);
/* Duplicate the even-indexed floats (real parts) and the odd-indexed floats
 * (imaginary parts) of the phase and of the increment vector.
 * NOTE(review): the assignment of inc_Val is not visible in this listing. */
312 yl = _mm256_moveldup_ps(phase_Val);
313 yh = _mm256_movehdup_ps(phase_Val);
314 ylp = _mm256_moveldup_ps(inc_Val);
315 yhp = _mm256_movehdup_ps(inc_Val);
/* First partial products: operand * real part of the multiplier. */
317 tmp1 = _mm256_mul_ps(aVal, yl);
318 tmp1p = _mm256_mul_ps(phase_Val, ylp);
/* 0xB1 swaps the real and imaginary float inside every complex value. */
320 aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
321 phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
/* Second partial products: swapped operand * imaginary part of the multiplier. */
322 tmp2 = _mm256_mul_ps(aVal, yh);
323 tmp2p = _mm256_mul_ps(phase_Val, yhp);
/* addsub subtracts in even lanes and adds in odd lanes, completing the
 * complex products: z = input * phase, phase_Val = phase * increment. */
325 z = _mm256_addsub_ps(tmp1, tmp2);
326 phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
/* Aligned store of the four rotated samples. */
328 _mm256_store_ps((
float*)cPtr, z);
/* Rescale all phases to unit magnitude: square, sum re^2+im^2 per complex
 * value (hadd, within each 128-bit half), spread each sum over both floats
 * of its own lane (0xD8), take the square root and divide. */
333 tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
334 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
335 tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
336 tmp2 = _mm256_sqrt_ps(tmp1);
337 phase_Val = _mm256_div_ps(phase_Val, tmp2);
/* Second copy of the load / complex-multiply / store sequence above.
 * NOTE(review): presumably the body of a remainder loop whose header is not
 * visible in this listing - confirm. */
340 aVal = _mm256_load_ps((
float*)aPtr);
342 yl = _mm256_moveldup_ps(phase_Val);
343 yh = _mm256_movehdup_ps(phase_Val);
344 ylp = _mm256_moveldup_ps(inc_Val);
345 yhp = _mm256_movehdup_ps(inc_Val);
347 tmp1 = _mm256_mul_ps(aVal, yl);
349 tmp1p = _mm256_mul_ps(phase_Val, ylp);
351 aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
352 phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
353 tmp2 = _mm256_mul_ps(aVal, yh);
354 tmp2p = _mm256_mul_ps(phase_Val, yhp);
356 z = _mm256_addsub_ps(tmp1, tmp2);
357 phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
359 _mm256_store_ps((
float*)cPtr, z);
/* Spill the vector phases back to the scalar array for the tail loop. */
365 _mm256_storeu_ps((
float*)phase_Ptr, phase_Val);
/* Scalar tail: up to three leftover samples, rotated with lane 0's phase. */
366 for(i = 0; i < num_points%4; ++
i) {
367 *cPtr++ = *aPtr++ * phase_Ptr[0];
368 phase_Ptr[0] *= (phase_inc);
/* Hand the final phase back to the caller. */
371 (*phase) = phase_Ptr[0];
379 #include <immintrin.h>
/*
 * AVX rotator using unaligned data loads/stores
 * (_mm256_loadu_ps/_mm256_storeu_ps). Each __m256 holds eight floats, i.e.
 * four complex values, so four samples are rotated per vector step; the
 * leftover num_points % 4 samples are handled by a scalar loop at the end.
 *
 * outVector  - NOTE(review): not referenced in the visible lines; presumably
 *              the initial value of cPtr (declaration not shown) - confirm.
 * inVector   - NOTE(review): likewise presumably the initial value of aPtr.
 * phase_inc  - complex per-sample phase step (used directly in the tail loop).
 * phase      - in/out: starting phase on entry, receives phase_Ptr[0] on exit.
 * num_points - sample count; split into num_points / 4 vector steps and
 *              num_points % 4 scalar steps.
 *
 * NOTE(review): this listing omits lines (inner loop headers, closing braces,
 * the declarations of aPtr/cPtr/incr and the set-up of inc_Val); comments
 * describe only what is shown.
 */
389 static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(
lv_32fc_t* outVector,
const lv_32fc_t* inVector,
const lv_32fc_t phase_inc,
lv_32fc_t* phase,
unsigned int num_points){
/* Four working phases, one per complex lane, all starting at *phase. */
393 lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
395 unsigned int i, j = 0;
/* Scale each lane's starting phase by incr.
 * NOTE(review): incr's declaration and any per-iteration update are not
 * visible here; presumably it grows by phase_inc each pass so that lane i
 * starts i steps ahead - confirm. */
397 for(i = 0; i < 4; ++
i) {
398 phase_Ptr[
i] *= incr;
407 __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
/* Load the four working phases into one vector. */
409 phase_Val = _mm256_loadu_ps((
float*)phase_Ptr);
/* Number of four-sample vector steps. */
411 const unsigned int fourthPoints = num_points / 4;
/* Outer loop: runs fourthPoints / ROTATOR_RELOAD times.
 * NOTE(review): the inner loop header is not visible in this listing. */
414 for(i = 0; i < (
unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
/* Unaligned load of four complex input samples. */
417 aVal = _mm256_loadu_ps((
float*)aPtr);
/* Duplicate the even-indexed floats (real parts) and the odd-indexed floats
 * (imaginary parts) of the phase and of the increment vector.
 * NOTE(review): the assignment of inc_Val is not visible in this listing. */
419 yl = _mm256_moveldup_ps(phase_Val);
420 yh = _mm256_movehdup_ps(phase_Val);
421 ylp = _mm256_moveldup_ps(inc_Val);
422 yhp = _mm256_movehdup_ps(inc_Val);
/* First partial products: operand * real part of the multiplier. */
424 tmp1 = _mm256_mul_ps(aVal, yl);
425 tmp1p = _mm256_mul_ps(phase_Val, ylp);
/* 0xB1 swaps the real and imaginary float inside every complex value. */
427 aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
428 phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
/* Second partial products: swapped operand * imaginary part of the multiplier. */
429 tmp2 = _mm256_mul_ps(aVal, yh);
430 tmp2p = _mm256_mul_ps(phase_Val, yhp);
/* addsub subtracts in even lanes and adds in odd lanes, completing the
 * complex products: z = input * phase, phase_Val = phase * increment. */
432 z = _mm256_addsub_ps(tmp1, tmp2);
433 phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
/* Unaligned store of the four rotated samples. */
435 _mm256_storeu_ps((
float*)cPtr, z);
/* Rescale all phases to unit magnitude: square, sum re^2+im^2 per complex
 * value (hadd, within each 128-bit half), spread each sum over both floats
 * of its own lane (0xD8), take the square root and divide. */
440 tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
441 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
442 tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
443 tmp2 = _mm256_sqrt_ps(tmp1);
444 phase_Val = _mm256_div_ps(phase_Val, tmp2);
/* Second copy of the load / complex-multiply / store sequence above.
 * NOTE(review): presumably the body of a remainder loop whose header is not
 * visible in this listing - confirm. */
447 aVal = _mm256_loadu_ps((
float*)aPtr);
449 yl = _mm256_moveldup_ps(phase_Val);
450 yh = _mm256_movehdup_ps(phase_Val);
451 ylp = _mm256_moveldup_ps(inc_Val);
452 yhp = _mm256_movehdup_ps(inc_Val);
454 tmp1 = _mm256_mul_ps(aVal, yl);
456 tmp1p = _mm256_mul_ps(phase_Val, ylp);
458 aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
459 phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
460 tmp2 = _mm256_mul_ps(aVal, yh);
461 tmp2p = _mm256_mul_ps(phase_Val, yhp);
463 z = _mm256_addsub_ps(tmp1, tmp2);
464 phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
466 _mm256_storeu_ps((
float*)cPtr, z);
/* Spill the vector phases back to the scalar array for the tail loop. */
472 _mm256_storeu_ps((
float*)phase_Ptr, phase_Val);
/* Scalar tail: up to three leftover samples, rotated with lane 0's phase. */
473 for(i = 0; i < num_points%4; ++
i) {
474 *cPtr++ = *aPtr++ * phase_Ptr[0];
475 phase_Ptr[0] *= (phase_inc);
/* Hand the final phase back to the caller. */
478 (*phase) = phase_Ptr[0];
float complex lv_32fc_t
Definition: volk_complex.h:56
#define lv_creal(x)
Definition: volk_complex.h:76
#define ROTATOR_RELOAD
Definition: volk_32fc_s32fc_x2_rotator_32fc.h:30
#define lv_cimag(x)
Definition: volk_complex.h:78
uint32_t i[4]
Definition: volk_common.h:80