23 #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
24 #define INCLUDED_volk_32fc_x2_multiply_32fc_u_H
32 #include <immintrin.h>
40 static inline void volk_32fc_x2_multiply_32fc_u_avx(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
41 unsigned int number = 0;
42 const unsigned int quarterPoints = num_points / 4;
44 __m256 x, y, yl, yh, z, tmp1, tmp2;
49 for(;number < quarterPoints; number++){
51 x = _mm256_loadu_ps((
float*)a);
52 y = _mm256_loadu_ps((
float*)b);
54 yl = _mm256_moveldup_ps(y);
55 yh = _mm256_movehdup_ps(y);
57 tmp1 = _mm256_mul_ps(x,yl);
59 x = _mm256_shuffle_ps(x,x,0xB1);
61 tmp2 = _mm256_mul_ps(x,yh);
63 z = _mm256_addsub_ps(tmp1,tmp2);
65 _mm256_storeu_ps((
float*)c,z);
72 number = quarterPoints * 4;
74 for(; number < num_points; number++) {
75 *c++ = (*a++) * (*b++);
81 #include <pmmintrin.h>
89 static inline void volk_32fc_x2_multiply_32fc_u_sse3(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
90 unsigned int number = 0;
91 const unsigned int halfPoints = num_points / 2;
93 __m128 x, y, yl, yh, z, tmp1, tmp2;
98 for(;number < halfPoints; number++){
100 x = _mm_loadu_ps((
float*)a);
101 y = _mm_loadu_ps((
float*)b);
103 yl = _mm_moveldup_ps(y);
104 yh = _mm_movehdup_ps(y);
106 tmp1 = _mm_mul_ps(x,yl);
108 x = _mm_shuffle_ps(x,x,0xB1);
110 tmp2 = _mm_mul_ps(x,yh);
112 z = _mm_addsub_ps(tmp1,tmp2);
114 _mm_storeu_ps((
float*)c,z);
121 if((num_points % 2) != 0) {
#ifdef LV_HAVE_GENERIC
/**
 * Portable scalar implementation: cVector[i] = aVector[i] * bVector[i] for
 * every i in [0, num_points).
 *
 * @param cVector    Output buffer, at least num_points elements.
 * @param aVector    First input buffer, at least num_points elements.
 * @param bVector    Second input buffer, at least num_points elements.
 * @param num_points Number of complex elements to multiply.
 *
 * NOTE(review): the listing this block was recovered from did not show the
 * declarations of aPtr/bPtr/cPtr, the closing braces, or the matching
 * #endif. They are reconstructed from usage; confirm against the upstream
 * file.
 */
static inline void volk_32fc_x2_multiply_32fc_generic(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
  lv_32fc_t* cPtr = cVector;
  const lv_32fc_t* aPtr = aVector;
  const lv_32fc_t* bPtr = bVector;

  for(unsigned int number = 0; number < num_points; number++){
    *cPtr++ = (*aPtr++) * (*bPtr++);
  }
}
#endif /* LV_HAVE_GENERIC */
149 #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H
150 #define INCLUDED_volk_32fc_x2_multiply_32fc_a_H
158 #include <immintrin.h>
166 static inline void volk_32fc_x2_multiply_32fc_a_avx(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
167 unsigned int number = 0;
168 const unsigned int quarterPoints = num_points / 4;
170 __m256 x, y, yl, yh, z, tmp1, tmp2;
175 for(;number < quarterPoints; number++){
177 x = _mm256_load_ps((
float*)a);
178 y = _mm256_load_ps((
float*)b);
180 yl = _mm256_moveldup_ps(y);
181 yh = _mm256_movehdup_ps(y);
183 tmp1 = _mm256_mul_ps(x,yl);
185 x = _mm256_shuffle_ps(x,x,0xB1);
187 tmp2 = _mm256_mul_ps(x,yh);
189 z = _mm256_addsub_ps(tmp1,tmp2);
191 _mm256_store_ps((
float*)c,z);
198 number = quarterPoints * 4;
200 for(; number < num_points; number++) {
201 *c++ = (*a++) * (*b++);
207 #include <pmmintrin.h>
215 static inline void volk_32fc_x2_multiply_32fc_a_sse3(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
216 unsigned int number = 0;
217 const unsigned int halfPoints = num_points / 2;
219 __m128 x, y, yl, yh, z, tmp1, tmp2;
223 for(;number < halfPoints; number++){
225 x = _mm_load_ps((
float*)a);
226 y = _mm_load_ps((
float*)b);
228 yl = _mm_moveldup_ps(y);
229 yh = _mm_movehdup_ps(y);
231 tmp1 = _mm_mul_ps(x,yl);
233 x = _mm_shuffle_ps(x,x,0xB1);
235 tmp2 = _mm_mul_ps(x,yh);
237 z = _mm_addsub_ps(tmp1,tmp2);
239 _mm_store_ps((
float*)c,z);
246 if((num_points % 2) != 0) {
#ifdef LV_HAVE_GENERIC
/**
 * Portable scalar implementation: cVector[i] = aVector[i] * bVector[i] for
 * every i in [0, num_points). Only plain scalar accesses are used.
 *
 * @param cVector    Output buffer, at least num_points elements.
 * @param aVector    First input buffer, at least num_points elements.
 * @param bVector    Second input buffer, at least num_points elements.
 * @param num_points Number of complex elements to multiply.
 *
 * NOTE(review): the listing this block was recovered from did not show the
 * declarations of aPtr/bPtr/cPtr, the closing braces, or the matching
 * #endif. They are reconstructed from usage; confirm against the upstream
 * file.
 */
static inline void volk_32fc_x2_multiply_32fc_a_generic(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
  lv_32fc_t* cPtr = cVector;
  const lv_32fc_t* aPtr = aVector;
  const lv_32fc_t* bPtr = bVector;

  for(unsigned int number = 0; number < num_points; number++){
    *cPtr++ = (*aPtr++) * (*bPtr++);
  }
}
#endif /* LV_HAVE_GENERIC */
273 #include <arm_neon.h>
/**
 * Multiply two complex float vectors element by element using NEON:
 * cVector[i] = aVector[i] * bVector[i].
 *
 * vld2q_f32 de-interleaves four complex values into a vector of real parts
 * (val[0]) and a vector of imaginary parts (val[1]); the product is formed
 * with four multiplies, one subtract and one add, then re-interleaved by
 * vst2q_f32. The remaining num_points % 4 elements use scalar code.
 *
 * @param cVector    Output buffer, at least num_points elements.
 * @param aVector    First input buffer, at least num_points elements.
 * @param bVector    Second input buffer, at least num_points elements.
 * @param num_points Number of complex elements to multiply.
 *
 * NOTE(review): the listing this block was recovered from did not show the
 * declarations of a_ptr/b_ptr, the per-iteration pointer advance, or the
 * closing braces. They are reconstructed from how the pointers are used
 * below; confirm against the upstream file.
 */
static inline void volk_32fc_x2_multiply_32fc_neon(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
    const lv_32fc_t* a_ptr = aVector;
    const lv_32fc_t* b_ptr = bVector;
    unsigned int quarter_points = num_points / 4;
    float32x4x2_t a_val, b_val, c_val;
    float32x4x2_t tmp_real, tmp_imag;
    unsigned int number = 0;

    for(number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((const float*)a_ptr); /* a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i */
        b_val = vld2q_f32((const float*)b_ptr); /* b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i */
        /* Hint the next block of four inputs into cache. */
        __builtin_prefetch(a_ptr+4);
        __builtin_prefetch(b_ptr+4);

        /* real*real and imag*imag feed the real part of the product. */
        tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
        tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);

        /* Cross terms feed the imaginary part of the product. */
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);

        /* re = ar*br - ai*bi, im = ar*bi + ai*br; store re-interleaved. */
        c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
        c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
        vst2q_f32((float*)cVector, c_val);

        /* Four complex values consumed per iteration. */
        a_ptr += 4;
        b_ptr += 4;
        cVector += 4;
    }

    /* Scalar tail for the elements that do not fill a whole vector. */
    for(number = quarter_points*4; number < num_points; number++){
        *cVector++ = (*a_ptr++) * (*b_ptr++);
    }
}
/**
 * Multiply two complex float vectors element by element using NEON with
 * multiply-accumulate / multiply-subtract:
 * cVector[i] = aVector[i] * bVector[i].
 *
 * Compared with the plain NEON kernel this variant needs only two multiplies
 * plus one vmlaq_f32 and one vmlsq_f32 per block of four complex values, and
 * builds the result directly in a single register pair. The remaining
 * num_points % 4 elements use scalar code.
 *
 * @param cVector    Output buffer, at least num_points elements.
 * @param aVector    First input buffer, at least num_points elements.
 * @param bVector    Second input buffer, at least num_points elements.
 * @param num_points Number of complex elements to multiply.
 *
 * NOTE(review): the listing this block was recovered from did not show the
 * declarations of a_ptr/b_ptr, the per-iteration pointer advance, or the
 * closing braces. They are reconstructed from how the pointers are used
 * below; confirm against the upstream file.
 */
static inline void volk_32fc_x2_multiply_32fc_neon_opttests(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
    const lv_32fc_t* a_ptr = aVector;
    const lv_32fc_t* b_ptr = bVector;
    unsigned int quarter_points = num_points / 4;
    float32x4x2_t a_val, b_val;
    /* Despite its name, tmp_imag holds the whole result: val[0] = real, val[1] = imag. */
    float32x4x2_t tmp_imag;
    unsigned int number = 0;

    for(number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((const float*)a_ptr); /* a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i */
        b_val = vld2q_f32((const float*)b_ptr); /* b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i */
        /* Hint the next block of four inputs into cache. */
        __builtin_prefetch(a_ptr+4);
        __builtin_prefetch(b_ptr+4);

        /* First partial products: im = ai*br, re = ar*br. */
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);

        /* Fold in the second terms: im += ar*bi, re -= ai*bi. */
        tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
        tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);

        /* Store re-interleaved as re,im pairs. */
        vst2q_f32((float*)cVector, tmp_imag);

        /* Four complex values consumed per iteration. */
        a_ptr += 4;
        b_ptr += 4;
        cVector += 4;
    }

    /* Scalar tail for the elements that do not fill a whole vector. */
    for(number = quarter_points*4; number < num_points; number++){
        *cVector++ = (*a_ptr++) * (*b_ptr++);
    }
}
380 extern void volk_32fc_x2_multiply_32fc_neonasm(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points);
391 extern void volk_32fc_x2_multiply_32fc_a_orc_impl(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points);
392 static inline void volk_32fc_x2_multiply_32fc_u_orc(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t* bVector,
unsigned int num_points){
393 volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
/* lv_32fc_t is a typedef for float complex (definition: volk_complex.h:56). */