23 #ifndef INCLUDED_volk_32f_x2_max_32f_a_H
24 #define INCLUDED_volk_32f_x2_max_32f_a_H
30 #include <xmmintrin.h>
/*!
 * \brief Element-wise maximum of two float vectors (SSE, aligned buffers).
 *
 * For every i in [0, num_points) stores the larger of aVector[i] and
 * bVector[i] into cVector[i]. Complete groups of four floats go through
 * _mm_max_ps; the remaining (num_points % 4) elements are handled by a
 * scalar loop.
 *
 * The vector path uses _mm_load_ps / _mm_store_ps, so all three buffers
 * must be 16-byte aligned.
 *
 * \param cVector    Output buffer, at least num_points floats.
 * \param aVector    First input buffer, at least num_points floats.
 * \param bVector    Second input buffer, at least num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_x2_max_32f_a_sse(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m128 aVal, bVal, cVal;
    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        bVal = _mm_load_ps(bPtr);

        cVal = _mm_max_ps(aVal, bVal);

        _mm_store_ps(cPtr, cVal);

        /* Step to the next group of four; without this every iteration
         * would reprocess the first four elements. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the pointers now sit at element quarterPoints * 4. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        const float a = *aPtr++;
        const float b = *bPtr++;
        *cPtr++ = (a > b ? a : b);
    }
}
/*!
 * \brief Element-wise maximum of two float vectors (NEON).
 *
 * For every i in [0, num_points) stores the maximum of aVector[i] and
 * bVector[i] into cVector[i]. Complete groups of four floats go through
 * vmaxq_f32; the remaining (num_points % 4) elements are handled by a
 * scalar loop.
 *
 * \param cVector    Output buffer, at least num_points floats.
 * \param aVector    First input buffer, at least num_points floats.
 * \param bVector    Second input buffer, at least num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_x2_max_32f_neon(float* cVector,
                                            const float* aVector,
                                            const float* bVector,
                                            unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number = 0;

    float32x4_t a_vec, b_vec, c_vec;
    for (number = 0; number < quarter_points; number++) {
        a_vec = vld1q_f32(aPtr);
        b_vec = vld1q_f32(bPtr);
        c_vec = vmaxq_f32(a_vec, b_vec);
        vst1q_f32(cPtr, c_vec);

        /* Step to the next group of four; without this every iteration
         * would reprocess the first four elements. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the pointers now sit at element quarter_points * 4. */
    for (number = quarter_points * 4; number < num_points; number++) {
        const float a = *aPtr++;
        const float b = *bPtr++;
        *cPtr++ = (a > b ? a : b);
    }
}
106 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Element-wise maximum of two float vectors (portable C).
 *
 * For every i in [0, num_points) stores aVector[i] into cVector[i] when
 * aVector[i] > bVector[i], and bVector[i] otherwise. Because the choice is
 * made with a single `>` comparison, bVector[i] is also the result whenever
 * that comparison is false for any other reason (equal values, or a NaN
 * operand).
 *
 * \param cVector    Output buffer, at least num_points floats.
 * \param aVector    First input buffer, at least num_points floats.
 * \param bVector    Second input buffer, at least num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_x2_max_32f_generic(float* cVector,
                                               const float* aVector,
                                               const float* bVector,
                                               unsigned int num_points)
{
    for (unsigned int i = 0; i < num_points; i++) {
        /* Read both inputs before writing the output element. */
        const float a = aVector[i];
        const float b = bVector[i];
        cVector[i] = (a > b ? a : b);
    }
}
/*
 * Implemented outside this file; only the prototype is visible here.
 * NOTE(review): presumably the ORC-generated kernel - confirm against the
 * build's ORC sources.
 */
extern void volk_32f_x2_max_32f_a_orc_impl(float* cVector,
                                           const float* aVector,
                                           const float* bVector,
                                           unsigned int num_points);

/*!
 * \brief ORC entry point: forwards all arguments unchanged to
 *        volk_32f_x2_max_32f_a_orc_impl.
 *
 * \param cVector    Output buffer, passed through to the implementation.
 * \param aVector    First input buffer, passed through to the implementation.
 * \param bVector    Second input buffer, passed through to the implementation.
 * \param num_points Element count, passed through to the implementation.
 */
static inline void volk_32f_x2_max_32f_u_orc(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}