23 #ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a_H
24 #define INCLUDED_volk_32fc_32f_multiply_32fc_a_H
30 #include <immintrin.h>
38 static inline void volk_32fc_32f_multiply_32fc_a_avx(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const float* bVector,
unsigned int num_points)
40 unsigned int number = 0;
41 const unsigned int eighthPoints = num_points / 8;
45 const float* bPtr= bVector;
47 __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2;
49 __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
51 for(;number < eighthPoints; number++){
53 aVal1 = _mm256_load_ps((
float *)aPtr);
56 aVal2 = _mm256_load_ps((
float *)aPtr);
59 bVal = _mm256_load_ps(bPtr);
62 bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00);
63 bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11);
65 bVal1 = _mm256_permutevar_ps(bVal1, permute_mask);
66 bVal2 = _mm256_permutevar_ps(bVal2, permute_mask);
68 cVal1 = _mm256_mul_ps(aVal1, bVal1);
69 cVal2 = _mm256_mul_ps(aVal2, bVal2);
71 _mm256_store_ps((
float*)cPtr,cVal1);
74 _mm256_store_ps((
float*)cPtr,cVal2);
78 number = eighthPoints * 8;
79 for(;number < num_points; ++number){
80 *cPtr++ = (*aPtr++) * (*bPtr++);
88 #include <xmmintrin.h>
96 static inline void volk_32fc_32f_multiply_32fc_a_sse(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const float* bVector,
unsigned int num_points)
98 unsigned int number = 0;
99 const unsigned int quarterPoints = num_points / 4;
103 const float* bPtr= bVector;
105 __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
106 for(;number < quarterPoints; number++){
108 aVal1 = _mm_load_ps((
const float*)aPtr);
111 aVal2 = _mm_load_ps((
const float*)aPtr);
114 bVal = _mm_load_ps(bPtr);
117 bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0));
118 bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2));
120 cVal = _mm_mul_ps(aVal1, bVal1);
122 _mm_store_ps((
float*)cPtr,cVal);
125 cVal = _mm_mul_ps(aVal2, bVal2);
127 _mm_store_ps((
float*)cPtr,cVal);
132 number = quarterPoints * 4;
133 for(;number < num_points; number++){
134 *cPtr++ = (*aPtr++) * (*bPtr);
140 #ifdef LV_HAVE_GENERIC
148 static inline void volk_32fc_32f_multiply_32fc_generic(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const float* bVector,
unsigned int num_points){
151 const float* bPtr= bVector;
152 unsigned int number = 0;
154 for(number = 0; number < num_points; number++){
155 *cPtr++ = (*aPtr++) * (*bPtr++);
161 #include <arm_neon.h>
/*!
 * \brief Multiplies a complex vector by a real vector, element by element
 *        (ARM NEON implementation).
 *
 * Computes cVector[i] = aVector[i] * bVector[i] for i in [0, num_points).
 * Four points are processed per vector iteration; any remaining points are
 * handled by a scalar tail loop.
 *
 * \param cVector    Output complex vector.
 * \param aVector    Input complex vector.
 * \param bVector    Input real vector.
 * \param num_points Number of elements in each of the three vectors.
 */
static inline void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number = 0;
    unsigned int quarter_points = num_points / 4;

    float32x4x2_t inputVector, outputVector;
    float32x4_t tapsVector;

    for (number = 0; number < quarter_points; number++) {
        /* vld2q_f32 de-interleaves eight floats: val[0] receives the four
         * even-indexed floats (real parts) and val[1] the four odd-indexed
         * floats (imaginary parts). */
        inputVector = vld2q_f32((const float*)aPtr);
        tapsVector = vld1q_f32(bPtr);

        outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector);
        outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector);

        /* vst2q_f32 re-interleaves the two planes on the way out. */
        vst2q_f32((float*)cPtr, outputVector);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the num_points % 4 leftover elements. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
205 extern void volk_32fc_32f_multiply_32fc_a_orc_impl(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const float* bVector,
unsigned int num_points);
206 static inline void volk_32fc_32f_multiply_32fc_u_orc(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const float* bVector,
unsigned int num_points){
207 volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
/* lv_32fc_t is defined as `float complex` (definition: volk_complex.h, line 56). */