23 #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
24 #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
32 #include <immintrin.h>
/*
 * AVX kernel, unaligned-load variant.
 *
 * The visible vector loop multiplies the values read through `a` by the
 * complex conjugate of the values read through `b` and stores the products
 * through `c`, four complex samples (eight floats) per iteration.
 *
 * cVector    - output buffer
 * aVector    - first input buffer
 * bVector    - second input buffer (the one that is conjugated)
 * num_points - number of complex samples to process
 *
 * NOTE(review): the declarations of `a`, `b` and `c`, their per-iteration
 * increments, the body of the tail loop and the closing braces are not
 * visible in this listing; presumably a/b/c alias aVector/bVector/cVector.
 * Confirm against the full source.
 */
40 static inline void volk_32fc_x2_multiply_conjugate_32fc_u_avx(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
41 unsigned int number = 0;
/* Number of full 4-sample (256-bit) iterations. */
42 const unsigned int quarterPoints = num_points / 4;
44 __m256 x, y, yl, yh, z, tmp1, tmp2;
/* Sign-bit mask: -0.f in every odd float lane, +0 in every even lane.
 * XOR-ing with it flips the sign of the odd lanes only, which are the
 * imaginary parts when the data is interleaved re,im pairs. */
49 __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
51 for(;number < quarterPoints; number++){
/* Unaligned loads of eight floats each from the two inputs. */
53 x = _mm256_loadu_ps((
float*)a);
54 y = _mm256_loadu_ps((
float*)b);
/* Negate the odd lanes of y (conjugate the second operand). */
56 y = _mm256_xor_ps(y, conjugator);
/* yl: even lanes of y duplicated into each pair; yh: odd lanes duplicated. */
58 yl = _mm256_moveldup_ps(y);
59 yh = _mm256_movehdup_ps(y);
/* tmp1 = x scaled lane-wise by the duplicated even (real) lanes of y. */
61 tmp1 = _mm256_mul_ps(x,yl);
/* 0xB1 swaps the two floats of every adjacent pair in x. */
63 x = _mm256_shuffle_ps(x,x,0xB1);
/* tmp2 = pair-swapped x scaled by the duplicated odd (imaginary) lanes of y. */
65 tmp2 = _mm256_mul_ps(x,yh);
/* Even lanes: tmp1 - tmp2; odd lanes: tmp1 + tmp2. This combines the
 * partial products into the complex product of x and the conjugated y. */
67 z = _mm256_addsub_ps(tmp1,tmp2);
/* Unaligned store of the eight result floats. */
69 _mm256_storeu_ps((
float*)c,z);
/* Resume at the first sample not covered by the vector loop. */
76 number = quarterPoints * 4;
/* Tail loop over the remaining num_points % 4 samples (body not visible here). */
78 for(; number < num_points; number++) {
85 #include <pmmintrin.h>
/*
 * SSE3 kernel, unaligned-load variant.
 *
 * The visible vector loop multiplies the values read through `a` by the
 * complex conjugate of the values read through `b` and stores the products
 * through `c`, two complex samples (four floats) per iteration.
 *
 * cVector    - output buffer
 * aVector    - first input buffer
 * bVector    - second input buffer (the one that is conjugated)
 * num_points - number of complex samples to process
 *
 * NOTE(review): the declarations of `a`, `b` and `c`, their increments, the
 * body of the odd-sample branch and the closing braces are not visible in
 * this listing; presumably a/b/c alias aVector/bVector/cVector.
 */
93 static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
94 unsigned int number = 0;
/* Number of full 2-sample (128-bit) iterations. */
95 const unsigned int halfPoints = num_points / 2;
97 __m128 x, y, yl, yh, z, tmp1, tmp2;
/* Sign-bit mask: -0.f in lanes 1 and 3, +0 in lanes 0 and 2. XOR-ing with
 * it flips the sign of the odd (imaginary) lanes only. */
102 __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
104 for(;number < halfPoints; number++){
/* Unaligned loads of four floats each from the two inputs. */
106 x = _mm_loadu_ps((
float*)a);
107 y = _mm_loadu_ps((
float*)b);
/* Negate the odd lanes of y (conjugate the second operand). */
109 y = _mm_xor_ps(y, conjugator);
/* yl: even lanes of y duplicated into each pair; yh: odd lanes duplicated. */
111 yl = _mm_moveldup_ps(y);
112 yh = _mm_movehdup_ps(y);
/* tmp1 = x scaled lane-wise by the duplicated even (real) lanes of y. */
114 tmp1 = _mm_mul_ps(x,yl);
/* 0xB1 swaps the two floats of every adjacent pair in x. */
116 x = _mm_shuffle_ps(x,x,0xB1);
/* tmp2 = pair-swapped x scaled by the duplicated odd (imaginary) lanes of y. */
118 tmp2 = _mm_mul_ps(x,yh);
/* Even lanes: tmp1 - tmp2; odd lanes: tmp1 + tmp2. */
120 z = _mm_addsub_ps(tmp1,tmp2);
/* Unaligned store of the four result floats. */
122 _mm_storeu_ps((
float*)c,z);
/* An odd num_points leaves one sample for this branch (body not visible here). */
129 if((num_points % 2) != 0) {
135 #ifdef LV_HAVE_GENERIC
/*
 * Portable scalar kernel: for each of the num_points samples, writes the
 * product of the current `aPtr` sample and the lv_conj() of the current
 * `bPtr` sample through `cPtr`, advancing all three pointers by one.
 *
 * cVector    - output buffer
 * aVector    - first input buffer
 * bVector    - second input buffer (the one passed through lv_conj)
 * num_points - number of complex samples to process
 *
 * NOTE(review): the declarations of `cPtr`, `aPtr` and `bPtr` and the closing
 * braces are not visible in this listing; presumably the pointers alias
 * cVector/aVector/bVector.
 */
143 static inline void volk_32fc_x2_multiply_conjugate_32fc_generic(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
147 unsigned int number = 0;
149 for(number = 0; number < num_points; number++){
/* One complex multiply per sample; each pointer is post-incremented. */
150 *cPtr++ = (*aPtr++) *
lv_conj(*bPtr++);
157 #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
158 #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
166 #include <immintrin.h>
/*
 * AVX kernel, aligned-load variant.
 *
 * Same arithmetic as the unaligned AVX kernel, but it uses
 * _mm256_load_ps / _mm256_store_ps, which require the addresses they are
 * given to be 32-byte aligned. Four complex samples (eight floats) are
 * processed per vector iteration.
 *
 * cVector    - output buffer
 * aVector    - first input buffer
 * bVector    - second input buffer (the one that is conjugated)
 * num_points - number of complex samples to process
 *
 * NOTE(review): the declarations of `a`, `b` and `c`, their increments, the
 * body of the tail loop and the closing braces are not visible in this
 * listing; presumably a/b/c alias aVector/bVector/cVector.
 */
174 static inline void volk_32fc_x2_multiply_conjugate_32fc_a_avx(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
175 unsigned int number = 0;
/* Number of full 4-sample (256-bit) iterations. */
176 const unsigned int quarterPoints = num_points / 4;
178 __m256 x, y, yl, yh, z, tmp1, tmp2;
/* Sign-bit mask: -0.f in every odd float lane, +0 in every even lane. */
183 __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
185 for(;number < quarterPoints; number++){
/* Aligned loads of eight floats each from the two inputs. */
187 x = _mm256_load_ps((
float*)a);
188 y = _mm256_load_ps((
float*)b);
/* Flip the sign of the odd (imaginary) lanes of y. */
190 y = _mm256_xor_ps(y, conjugator);
/* yl: even lanes of y duplicated into each pair; yh: odd lanes duplicated. */
192 yl = _mm256_moveldup_ps(y);
193 yh = _mm256_movehdup_ps(y);
/* tmp1 = x scaled lane-wise by the duplicated even (real) lanes of y. */
195 tmp1 = _mm256_mul_ps(x,yl);
/* 0xB1 swaps the two floats of every adjacent pair in x. */
197 x = _mm256_shuffle_ps(x,x,0xB1);
/* tmp2 = pair-swapped x scaled by the duplicated odd (imaginary) lanes of y. */
199 tmp2 = _mm256_mul_ps(x,yh);
/* Even lanes: tmp1 - tmp2; odd lanes: tmp1 + tmp2. */
201 z = _mm256_addsub_ps(tmp1,tmp2);
/* Aligned store of the eight result floats. */
203 _mm256_store_ps((
float*)c,z);
/* Resume at the first sample not covered by the vector loop. */
210 number = quarterPoints * 4;
/* Tail loop over the remaining num_points % 4 samples (body not visible here). */
212 for(; number < num_points; number++) {
219 #include <pmmintrin.h>
/*
 * SSE3 kernel, aligned-load variant.
 *
 * Same arithmetic as the unaligned SSE3 kernel, but it uses
 * _mm_load_ps / _mm_store_ps, which require the addresses they are given to
 * be 16-byte aligned. Two complex samples (four floats) are processed per
 * vector iteration.
 *
 * cVector    - output buffer
 * aVector    - first input buffer
 * bVector    - second input buffer (the one that is conjugated)
 * num_points - number of complex samples to process
 *
 * NOTE(review): the declarations of `a`, `b` and `c`, their increments, the
 * body of the odd-sample branch and the closing braces are not visible in
 * this listing; presumably a/b/c alias aVector/bVector/cVector.
 */
227 static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
228 unsigned int number = 0;
/* Number of full 2-sample (128-bit) iterations. */
229 const unsigned int halfPoints = num_points / 2;
231 __m128 x, y, yl, yh, z, tmp1, tmp2;
/* Sign-bit mask: -0.f in lanes 1 and 3, +0 in lanes 0 and 2. */
236 __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
238 for(;number < halfPoints; number++){
/* Aligned loads of four floats each from the two inputs. */
240 x = _mm_load_ps((
float*)a);
241 y = _mm_load_ps((
float*)b);
/* Flip the sign of the odd (imaginary) lanes of y. */
243 y = _mm_xor_ps(y, conjugator);
/* yl: even lanes of y duplicated into each pair; yh: odd lanes duplicated. */
245 yl = _mm_moveldup_ps(y);
246 yh = _mm_movehdup_ps(y);
/* tmp1 = x scaled lane-wise by the duplicated even (real) lanes of y. */
248 tmp1 = _mm_mul_ps(x,yl);
/* 0xB1 swaps the two floats of every adjacent pair in x. */
250 x = _mm_shuffle_ps(x,x,0xB1);
/* tmp2 = pair-swapped x scaled by the duplicated odd (imaginary) lanes of y. */
252 tmp2 = _mm_mul_ps(x,yh);
/* Even lanes: tmp1 - tmp2; odd lanes: tmp1 + tmp2. */
254 z = _mm_addsub_ps(tmp1,tmp2);
/* Aligned store of the four result floats. */
256 _mm_store_ps((
float*)c,z);
/* An odd num_points leaves one sample for this branch (body not visible here). */
263 if((num_points % 2) != 0) {
270 #include <arm_neon.h>
/*
 * ARM NEON kernel.
 *
 * The vector loop handles four complex samples per iteration using
 * de-interleaving loads: val[0] holds the four real parts and val[1] the
 * four imaginary parts. The imaginary parts of the second operand are
 * negated before the multiply, so the stored result is the first operand
 * times the conjugate of the second. A scalar loop handles the remaining
 * num_points % 4 samples.
 *
 * cVector    - output buffer
 * aVector    - first input buffer
 * bVector    - second input buffer (the one that is conjugated)
 * num_points - number of complex samples to process
 *
 * NOTE(review): the declarations of `a_ptr` and `b_ptr`, the per-iteration
 * pointer advances inside the vector loop and the closing braces are not
 * visible in this listing; presumably a_ptr/b_ptr alias aVector/bVector.
 */
278 static inline void volk_32fc_x2_multiply_conjugate_32fc_neon(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
/* Number of full 4-sample iterations. */
282 unsigned int quarter_points = num_points / 4;
283 float32x4x2_t a_val, b_val, c_val;
284 float32x4x2_t tmp_real, tmp_imag;
285 unsigned int number = 0;
287 for(number = 0; number < quarter_points; ++number) {
/* De-interleaving loads: val[0] = real parts, val[1] = imaginary parts. */
288 a_val = vld2q_f32((
float*)a_ptr);
289 b_val = vld2q_f32((
float*)b_ptr);
/* Conjugate the second operand by negating its imaginary parts. */
290 b_val.val[1] = vnegq_f32(b_val.val[1]);
/* Prefetch hints for the addresses four samples ahead of each input. */
291 __builtin_prefetch(a_ptr+4);
292 __builtin_prefetch(b_ptr+4);
/* Partial products for the real part: ar*br and ai*bi'. */
296 tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
298 tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
/* Partial products for the imaginary part: ar*bi' and ai*br. */
302 tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
304 tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
/* real = ar*br - ai*bi', imag = ar*bi' + ai*br, where bi' is the negated bi. */
307 c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
308 c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
/* Re-interleave the real and imaginary lanes and store four complex results. */
309 vst2q_f32((
float*)cVector, c_val);
/* Scalar tail for the samples the vector loop did not cover. */
/* NOTE(review): this uses conj() while the generic kernels use lv_conj();
 * confirm the intended precision and that <complex.h> is in scope. */
316 for(number = quarter_points*4; number < num_points; number++){
317 *cVector++ = (*a_ptr++) * conj(*b_ptr++);
322 #ifdef LV_HAVE_GENERIC
/*
 * Portable scalar kernel (the `_a_` named variant): for each of the
 * num_points samples, writes the product of the current `aPtr` sample and
 * the lv_conj() of the current `bPtr` sample through `cPtr`, advancing all
 * three pointers by one. The visible statements match those of the other
 * generic kernel in this file.
 *
 * cVector    - output buffer
 * aVector    - first input buffer
 * bVector    - second input buffer (the one passed through lv_conj)
 * num_points - number of complex samples to process
 *
 * NOTE(review): the declarations of `cPtr`, `aPtr` and `bPtr` and the closing
 * braces are not visible in this listing; presumably the pointers alias
 * cVector/aVector/bVector.
 */
330 static inline void volk_32fc_x2_multiply_conjugate_32fc_a_generic(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
334 unsigned int number = 0;
336 for(number = 0; number < num_points; number++){
/* One complex multiply per sample; each pointer is post-incremented. */
337 *cPtr++ = (*aPtr++) *
lv_conj(*bPtr++);
#define lv_conj(x)
Definition: volk_complex.h:80
float complex lv_32fc_t
Definition: volk_complex.h:56