23 #ifndef INCLUDED_volk_32f_x2_add_32f_u_H
24 #define INCLUDED_volk_32f_x2_add_32f_u_H
30 #include <xmmintrin.h>
/**
 * Adds two float buffers element by element using SSE intrinsics.
 *
 * Unaligned loads and stores are used, so none of the three buffers needs
 * any particular alignment.
 *
 * \param cVector    Output buffer; receives aVector[i] + bVector[i].
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of floats to process from each buffer.
 */
static inline void volk_32f_x2_add_32f_u_sse(
    float* cVector,
    const float* aVector,
    const float* bVector,
    unsigned int num_points){
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;

  __m128 aVal, bVal, cVal;
  for(; number < quarterPoints; number++){
    aVal = _mm_loadu_ps(aPtr);
    bVal = _mm_loadu_ps(bPtr);

    cVal = _mm_add_ps(aVal, bVal);

    _mm_storeu_ps(cPtr, cVal);

    /* One __m128 holds four floats: step every cursor past the group just
       handled so the next iteration (and the tail loop) sees fresh data. */
    aPtr += 4;
    bPtr += 4;
    cPtr += 4;
  }

  /* Scalar tail for the num_points % 4 elements the vector loop left over. */
  number = quarterPoints * 4;
  for(; number < num_points; number++){
    *cPtr++ = (*aPtr++) + (*bPtr++);
  }
}
68 #ifdef LV_HAVE_GENERIC
/**
 * Adds two float buffers element by element with a plain scalar loop.
 *
 * \param cVector    Output buffer; receives aVector[i] + bVector[i].
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of floats to process from each buffer.
 */
static inline void volk_32f_x2_add_32f_generic(
    float* cVector,
    const float* aVector,
    const float* bVector,
    unsigned int num_points){
  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;
  unsigned int number = 0;

  for(number = 0; number < num_points; number++){
    *cPtr++ = (*aPtr++) + (*bPtr++);
  }
}
89 #ifndef INCLUDED_volk_32f_x2_add_32f_a_H
90 #define INCLUDED_volk_32f_x2_add_32f_a_H
96 #include <xmmintrin.h>
/**
 * Adds two float buffers element by element using SSE intrinsics with
 * aligned memory access.
 *
 * The vector loop uses _mm_load_ps / _mm_store_ps, which require their
 * addresses to be 16-byte aligned; all three buffers must therefore be
 * 16-byte aligned whenever num_points >= 4.
 *
 * \param cVector    Output buffer; receives aVector[i] + bVector[i].
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of floats to process from each buffer.
 */
static inline void volk_32f_x2_add_32f_a_sse(
    float* cVector,
    const float* aVector,
    const float* bVector,
    unsigned int num_points){
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;

  __m128 aVal, bVal, cVal;
  for(; number < quarterPoints; number++){
    aVal = _mm_load_ps(aPtr);
    bVal = _mm_load_ps(bPtr);

    cVal = _mm_add_ps(aVal, bVal);

    _mm_store_ps(cPtr, cVal);

    /* Advance by one full vector (4 floats = 16 bytes), which also keeps
       the cursors on a 16-byte boundary for the next aligned access. */
    aPtr += 4;
    bPtr += 4;
    cPtr += 4;
  }

  /* Scalar tail for the num_points % 4 elements the vector loop left over. */
  number = quarterPoints * 4;
  for(; number < num_points; number++){
    *cPtr++ = (*aPtr++) + (*bPtr++);
  }
}
135 #include <arm_neon.h>
/**
 * Adds two float buffers element by element using NEON intrinsics.
 *
 * \param cVector    Output buffer; receives aVector[i] + bVector[i].
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of floats to process from each buffer.
 */
static inline void volk_32f_x2_add_32f_u_neon(
    float* cVector,
    const float* aVector,
    const float* bVector,
    unsigned int num_points) {
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;
  float32x4_t aVal, bVal, cVal;
  for(number = 0; number < quarterPoints; number++){
    aVal = vld1q_f32(aPtr);
    bVal = vld1q_f32(bPtr);
    /* Hint the cache about the next group of four inputs; a prefetch is
       only a hint and does not dereference the address. */
    __builtin_prefetch(aPtr + 4);
    __builtin_prefetch(bPtr + 4);

    cVal = vaddq_f32(aVal, bVal);

    vst1q_f32(cPtr, cVal);

    /* A float32x4_t holds four floats: step every cursor past the group
       just handled. */
    aPtr += 4;
    bPtr += 4;
    cPtr += 4;
  }

  /* Scalar tail for the num_points % 4 elements the vector loop left over. */
  number = quarterPoints * 4;
  for(; number < num_points; number++){
    *cPtr++ = (*aPtr++) + (*bPtr++);
  }
}
177 #ifdef LV_HAVE_GENERIC
/**
 * Adds two float buffers element by element with a plain scalar loop.
 *
 * The body performs only scalar float accesses, so nothing in it depends
 * on the buffers having more than natural float alignment.
 *
 * \param cVector    Output buffer; receives aVector[i] + bVector[i].
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of floats to process from each buffer.
 */
static inline void volk_32f_x2_add_32f_a_generic(
    float* cVector,
    const float* aVector,
    const float* bVector,
    unsigned int num_points){
  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;
  unsigned int number = 0;

  for(number = 0; number < num_points; number++){
    *cPtr++ = (*aPtr++) + (*bPtr++);
  }
}
/* Declared here, defined outside this file. */
extern void volk_32f_x2_add_32f_a_orc_impl(
    float* cVector,
    const float* aVector,
    const float* bVector,
    unsigned int num_points);

/**
 * Thin wrapper that forwards all arguments, unchanged, to
 * volk_32f_x2_add_32f_a_orc_impl.
 *
 * NOTE(review): this wrapper is named "_u_" but dispatches to the "_a_"
 * implementation; confirm the implementation places no alignment
 * requirement on the buffers.
 *
 * \param cVector    Output buffer, passed through to the implementation.
 * \param aVector    First input buffer, passed through.
 * \param bVector    Second input buffer, passed through.
 * \param num_points Element count, passed through.
 */
static inline void volk_32f_x2_add_32f_u_orc(
    float* cVector,
    const float* aVector,
    const float* bVector,
    unsigned int num_points){
  volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}