23 #ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H
24 #define INCLUDED_volk_32f_x2_multiply_32f_u_H
30 #include <xmmintrin.h>
/*!
  \brief Multiplies two float vectors element by element using SSE with
         unaligned loads and stores.

  The bulk of the data is handled four floats at a time; the remaining
  (num_points % 4) elements are multiplied one by one.

  \param cVector    Output buffer; receives aVector[i] * bVector[i].
  \param aVector    First input vector.
  \param bVector    Second input vector.
  \param num_points Number of floats to read from each input and to write
                    to cVector.
*/
static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;

  __m128 aVal, bVal, cVal;
  for (; number < quarterPoints; number++) {
    aVal = _mm_loadu_ps(aPtr);
    bVal = _mm_loadu_ps(bPtr);

    cVal = _mm_mul_ps(aVal, bVal);

    _mm_storeu_ps(cPtr, cVal);

    /* Step past the four floats just handled. Without this every iteration
       would rewrite the first four outputs and the scalar tail below would
       start from the wrong element. */
    aPtr += 4;
    bPtr += 4;
    cPtr += 4;
  }

  /* Scalar tail: the pointers now sit right after the last full group. */
  number = quarterPoints * 4;
  for (; number < num_points; number++) {
    *cPtr++ = (*aPtr++) * (*bPtr++);
  }
}
69 #include <immintrin.h>
/*!
  \brief Multiplies two float vectors element by element using AVX with
         unaligned loads and stores.

  The bulk of the data is handled eight floats at a time; the remaining
  (num_points % 8) elements are multiplied one by one.

  \param cVector    Output buffer; receives aVector[i] * bVector[i].
  \param aVector    First input vector.
  \param bVector    Second input vector.
  \param num_points Number of floats to read from each input and to write
                    to cVector.
*/
static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;

  __m256 aVal, bVal, cVal;
  for (; number < eighthPoints; number++) {
    aVal = _mm256_loadu_ps(aPtr);
    bVal = _mm256_loadu_ps(bPtr);

    cVal = _mm256_mul_ps(aVal, bVal);

    _mm256_storeu_ps(cPtr, cVal);

    /* Step past the eight floats just handled. Without this every iteration
       would rewrite the first eight outputs and the scalar tail below would
       start from the wrong element. */
    aPtr += 8;
    bPtr += 8;
    cPtr += 8;
  }

  /* Scalar tail: the pointers now sit right after the last full group. */
  number = eighthPoints * 8;
  for (; number < num_points; number++) {
    *cPtr++ = (*aPtr++) * (*bPtr++);
  }
}
107 #ifdef LV_HAVE_GENERIC
/*!
  \brief Multiplies two float vectors element by element with plain scalar
         code.

  \param cVector    Output buffer; receives aVector[i] * bVector[i].
  \param aVector    First input vector.
  \param bVector    Second input vector.
  \param num_points Number of floats to read from each input and to write
                    to cVector.
*/
static inline void volk_32f_x2_multiply_32f_generic(float* cVector,
                                                    const float* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points)
{
  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;
  unsigned int number;

  for (number = 0; number < num_points; number++) {
    *cPtr++ = (*aPtr++) * (*bPtr++);
  }
}
129 #ifndef INCLUDED_volk_32f_x2_multiply_32f_a_H
130 #define INCLUDED_volk_32f_x2_multiply_32f_a_H
136 #include <xmmintrin.h>
/*!
  \brief Multiplies two float vectors element by element using SSE with
         aligned loads and stores.

  The bulk of the data is handled four floats at a time; the remaining
  (num_points % 4) elements are multiplied one by one.

  \param cVector    Output buffer; receives aVector[i] * bVector[i].
                    Must be 16-byte aligned (_mm_store_ps is used).
  \param aVector    First input vector. Must be 16-byte aligned
                    (_mm_load_ps is used).
  \param bVector    Second input vector. Must be 16-byte aligned
                    (_mm_load_ps is used).
  \param num_points Number of floats to read from each input and to write
                    to cVector.
*/
static inline void volk_32f_x2_multiply_32f_a_sse(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;

  __m128 aVal, bVal, cVal;
  for (; number < quarterPoints; number++) {
    aVal = _mm_load_ps(aPtr);
    bVal = _mm_load_ps(bPtr);

    cVal = _mm_mul_ps(aVal, bVal);

    _mm_store_ps(cPtr, cVal);

    /* Step past the four floats just handled (16 bytes, so alignment is
       preserved). Without this every iteration would rewrite the first four
       outputs and the scalar tail below would start from the wrong element. */
    aPtr += 4;
    bPtr += 4;
    cPtr += 4;
  }

  /* Scalar tail: the pointers now sit right after the last full group. */
  number = quarterPoints * 4;
  for (; number < num_points; number++) {
    *cPtr++ = (*aPtr++) * (*bPtr++);
  }
}
175 #include <immintrin.h>
/*!
  \brief Multiplies two float vectors element by element using AVX with
         aligned loads and stores.

  The bulk of the data is handled eight floats at a time; the remaining
  (num_points % 8) elements are multiplied one by one.

  \param cVector    Output buffer; receives aVector[i] * bVector[i].
                    Must be 32-byte aligned (_mm256_store_ps is used).
  \param aVector    First input vector. Must be 32-byte aligned
                    (_mm256_load_ps is used).
  \param bVector    Second input vector. Must be 32-byte aligned
                    (_mm256_load_ps is used).
  \param num_points Number of floats to read from each input and to write
                    to cVector.
*/
static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;

  __m256 aVal, bVal, cVal;
  for (; number < eighthPoints; number++) {
    aVal = _mm256_load_ps(aPtr);
    bVal = _mm256_load_ps(bPtr);

    cVal = _mm256_mul_ps(aVal, bVal);

    _mm256_store_ps(cPtr, cVal);

    /* Step past the eight floats just handled (32 bytes, so alignment is
       preserved). Without this every iteration would rewrite the first eight
       outputs and the scalar tail below would start from the wrong element. */
    aPtr += 8;
    bPtr += 8;
    cPtr += 8;
  }

  /* Scalar tail: the pointers now sit right after the last full group. */
  number = eighthPoints * 8;
  for (; number < num_points; number++) {
    *cPtr++ = (*aPtr++) * (*bPtr++);
  }
}
214 #include <arm_neon.h>
/*!
  \brief Multiplies two float vectors element by element using NEON.

  The bulk of the data is handled four floats at a time; the remaining
  (num_points % 4) elements are multiplied one by one.

  \param cVector    Output buffer; receives aVector[i] * bVector[i].
  \param aVector    First input vector.
  \param bVector    Second input vector.
  \param num_points Number of floats to read from each input and to write
                    to cVector.
*/
static inline void volk_32f_x2_multiply_32f_neon(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
  const unsigned int quarter_points = num_points / 4;
  unsigned int number; /* loop counter shared by the vector and tail loops */

  float32x4_t avec, bvec, cvec;
  for (number = 0; number < quarter_points; ++number) {
    avec = vld1q_f32(aVector);
    bvec = vld1q_f32(bVector);
    cvec = vmulq_f32(avec, bvec);
    vst1q_f32(cVector, cvec);

    /* Step past the four floats just handled. Without this every iteration
       would rewrite the first four outputs and the scalar tail below would
       start from the wrong element. */
    aVector += 4;
    bVector += 4;
    cVector += 4;
  }

  /* Scalar tail: the pointers now sit right after the last full group. */
  for (number = quarter_points * 4; number < num_points; ++number) {
    *cVector++ = *aVector++ * *bVector++;
  }
}
242 #ifdef LV_HAVE_GENERIC
/*!
  \brief Multiplies two float vectors element by element with plain scalar
         code.

  Only scalar float accesses are performed, so the code itself places no
  alignment demand on the buffers beyond that of float.

  \param cVector    Output buffer; receives aVector[i] * bVector[i].
  \param aVector    First input vector.
  \param bVector    Second input vector.
  \param num_points Number of floats to read from each input and to write
                    to cVector.
*/
static inline void volk_32f_x2_multiply_32f_a_generic(float* cVector,
                                                      const float* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;
  unsigned int number;

  for (number = 0; number < num_points; number++) {
    *cPtr++ = (*aPtr++) * (*bPtr++);
  }
}
/*!
  \brief Float vector multiply kernel defined outside this header.

  Only the prototype lives here; the volk_32f_x2_multiply_32f_u_orc wrapper in
  this file forwards its four arguments to this function unchanged.
  NOTE(review): presumably the ORC-generated implementation - confirm against
  the build.

  \param cVector    Output buffer.
  \param aVector    First input vector.
  \param bVector    Second input vector.
  \param num_points Number of floats to process.
*/
extern void volk_32f_x2_multiply_32f_a_orc_impl(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points);
271 static inline void volk_32f_x2_multiply_32f_u_orc(
float* cVector,
const float* aVector,
const float* bVector,
unsigned int num_points){
272 volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points);