23 #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
24 #define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
32 #include <immintrin.h>
40 static inline void volk_32fc_s32fc_multiply_32fc_u_avx(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t scalar,
unsigned int num_points){
41 unsigned int number = 0;
43 const unsigned int quarterPoints = num_points / 4;
44 unsigned int isodd = num_points & 3;
45 __m256 x, yl, yh, z, tmp1, tmp2;
50 yl = _mm256_set1_ps(
lv_creal(scalar));
51 yh = _mm256_set1_ps(
lv_cimag(scalar));
53 for(;number < quarterPoints; number++){
54 x = _mm256_loadu_ps((
float*)a);
56 tmp1 = _mm256_mul_ps(x,yl);
58 x = _mm256_shuffle_ps(x,x,0xB1);
60 tmp2 = _mm256_mul_ps(x,yh);
62 z = _mm256_addsub_ps(tmp1,tmp2);
64 _mm256_storeu_ps((
float*)c,z);
70 for(i = num_points-isodd; i < num_points; i++) {
71 *c++ = (*a++) * scalar;
78 #include <pmmintrin.h>
86 static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t scalar,
unsigned int num_points){
87 unsigned int number = 0;
88 const unsigned int halfPoints = num_points / 2;
90 __m128 x, yl, yh, z, tmp1, tmp2;
98 for(;number < halfPoints; number++){
100 x = _mm_loadu_ps((
float*)a);
102 tmp1 = _mm_mul_ps(x,yl);
104 x = _mm_shuffle_ps(x,x,0xB1);
106 tmp2 = _mm_mul_ps(x,yh);
108 z = _mm_addsub_ps(tmp1,tmp2);
110 _mm_storeu_ps((
float*)c,z);
116 if((num_points % 2) != 0) {
122 #ifdef LV_HAVE_GENERIC
130 static inline void volk_32fc_s32fc_multiply_32fc_generic(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t scalar,
unsigned int num_points){
133 unsigned int number = num_points;
137 *cPtr++ = (*aPtr++) * scalar;
138 *cPtr++ = (*aPtr++) * scalar;
139 *cPtr++ = (*aPtr++) * scalar;
140 *cPtr++ = (*aPtr++) * scalar;
141 *cPtr++ = (*aPtr++) * scalar;
142 *cPtr++ = (*aPtr++) * scalar;
143 *cPtr++ = (*aPtr++) * scalar;
144 *cPtr++ = (*aPtr++) * scalar;
150 *cPtr++ = *aPtr++ * scalar;
156 #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
157 #define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
165 #include <immintrin.h>
173 static inline void volk_32fc_s32fc_multiply_32fc_a_avx(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t scalar,
unsigned int num_points){
174 unsigned int number = 0;
176 const unsigned int quarterPoints = num_points / 4;
177 unsigned int isodd = num_points & 3;
178 __m256 x, yl, yh, z, tmp1, tmp2;
183 yl = _mm256_set1_ps(
lv_creal(scalar));
184 yh = _mm256_set1_ps(
lv_cimag(scalar));
186 for(;number < quarterPoints; number++){
187 x = _mm256_load_ps((
float*)a);
189 tmp1 = _mm256_mul_ps(x,yl);
191 x = _mm256_shuffle_ps(x,x,0xB1);
193 tmp2 = _mm256_mul_ps(x,yh);
195 z = _mm256_addsub_ps(tmp1,tmp2);
197 _mm256_store_ps((
float*)c,z);
203 for(i = num_points-isodd; i < num_points; i++) {
204 *c++ = (*a++) * scalar;
211 #include <pmmintrin.h>
219 static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t scalar,
unsigned int num_points){
220 unsigned int number = 0;
221 const unsigned int halfPoints = num_points / 2;
223 __m128 x, yl, yh, z, tmp1, tmp2;
231 for(;number < halfPoints; number++){
233 x = _mm_load_ps((
float*)a);
235 tmp1 = _mm_mul_ps(x,yl);
237 x = _mm_shuffle_ps(x,x,0xB1);
239 tmp2 = _mm_mul_ps(x,yh);
241 z = _mm_addsub_ps(tmp1,tmp2);
243 _mm_store_ps((
float*)c,z);
249 if((num_points % 2) != 0) {
256 #include <arm_neon.h>
/*!
  \brief Multiplies a complex vector by a complex scalar (ARM NEON)
  \param cVector The output vector; receives num_points complex products
  \param aVector The input vector of num_points complex values
  \param scalar The complex value every element of aVector is multiplied by
  \param num_points The number of complex values in aVector and cVector
*/
static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    /* A complex float is laid out as {real, imag}, so it can be read as two floats. */
    const float* scalarParts = (const float*)&scalar;
    unsigned int number = 0;
    const unsigned int quarter_points = num_points / 4;

    float32x4x2_t a_val, scalar_val;
    float32x4x2_t product;

    /* Broadcast each half of the scalar to all four lanes. A de-interleaving
       vld2q_f32 here would read eight floats from a two-float object. */
    scalar_val.val[0] = vld1q_dup_f32(scalarParts);     /* sr,sr,sr,sr */
    scalar_val.val[1] = vld1q_dup_f32(scalarParts + 1); /* si,si,si,si */

    for(number = 0; number < quarter_points; ++number){
        /* De-interleave four points: val[0] = real parts, val[1] = imaginary parts. */
        a_val = vld2q_f32((const float*)aPtr);

        product.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]); /* ar*sr */
        product.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]); /* ai*sr */

        /* real = ar*sr - ai*si (multiply-subtract), imag = ai*sr + ar*si. */
        product.val[0] = vmlsq_f32(product.val[0], a_val.val[1], scalar_val.val[1]);
        product.val[1] = vmlaq_f32(product.val[1], a_val.val[0], scalar_val.val[1]);

        /* Re-interleave and store through the same cursor the tail loop uses. */
        vst2q_f32((float*)cPtr, product);

        aPtr += 4;
        cPtr += 4;
    }

    /* Finish the 0..3 points that did not fill a whole vector. */
    for(number = quarter_points * 4; number < num_points; number++){
        *cPtr++ = *aPtr++ * scalar;
    }
}
293 #ifdef LV_HAVE_GENERIC
301 static inline void volk_32fc_s32fc_multiply_32fc_a_generic(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t scalar,
unsigned int num_points){
304 unsigned int number = num_points;
308 *cPtr++ = (*aPtr++) * scalar;
309 *cPtr++ = (*aPtr++) * scalar;
310 *cPtr++ = (*aPtr++) * scalar;
311 *cPtr++ = (*aPtr++) * scalar;
312 *cPtr++ = (*aPtr++) * scalar;
313 *cPtr++ = (*aPtr++) * scalar;
314 *cPtr++ = (*aPtr++) * scalar;
315 *cPtr++ = (*aPtr++) * scalar;
321 *cPtr++ = *aPtr++ * scalar;
float complex lv_32fc_t
Definition: volk_complex.h:56
#define lv_creal(x)
Definition: volk_complex.h:76
#define lv_cimag(x)
Definition: volk_complex.h:78
uint32_t i[4]
Definition: volk_common.h:80