23 #ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H
24 #define INCLUDED_volk_32f_s32f_multiply_32f_u_H
30 #include <xmmintrin.h>
/*!
 * \brief Multiplies each float in aVector by scalar and stores the products
 *        in cVector, four elements at a time using SSE.
 *
 * Uses _mm_loadu_ps / _mm_storeu_ps, so neither buffer needs to be aligned.
 *
 * \param cVector    Output buffer; receives num_points products.
 * \param aVector    Input buffer; num_points floats are read from it.
 * \param scalar     Factor applied to every input element.
 * \param num_points Number of floats to process.
 */
static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    /* Broadcast the scalar once so it can be reused for every vector. */
    const __m128 bVal = _mm_set_ps1(scalar);

    for (; number < quarterPoints; number++) {
        const __m128 aVal = _mm_loadu_ps(aPtr);
        const __m128 cVal = _mm_mul_ps(aVal, bVal);
        _mm_storeu_ps(cPtr, cVal);

        /* Step to the next group of four floats; without this every
         * iteration would reprocess the first four elements. */
        aPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the num_points % 4 elements the vector loop skipped. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * scalar;
    }
}
67 #include <immintrin.h>
/*!
 * \brief Multiplies each float in aVector by scalar and stores the products
 *        in cVector, eight elements at a time using AVX.
 *
 * Uses _mm256_loadu_ps / _mm256_storeu_ps, so neither buffer needs to be
 * aligned.
 *
 * \param cVector    Output buffer; receives num_points products.
 * \param aVector    Input buffer; num_points floats are read from it.
 * \param scalar     Factor applied to every input element.
 * \param num_points Number of floats to process.
 */
static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    /* Broadcast the scalar once so it can be reused for every vector. */
    const __m256 bVal = _mm256_set1_ps(scalar);

    for (; number < eighthPoints; number++) {
        const __m256 aVal = _mm256_loadu_ps(aPtr);
        const __m256 cVal = _mm256_mul_ps(aVal, bVal);
        _mm256_storeu_ps(cPtr, cVal);

        /* Step to the next group of eight floats; without this every
         * iteration would reprocess the first eight elements. */
        aPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the num_points % 8 elements the vector loop skipped. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * scalar;
    }
}
103 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Portable scalar kernel: multiplies each float in aVector by scalar
 *        and stores the products in cVector.
 *
 * \param cVector    Output buffer; receives num_points products.
 * \param aVector    Input buffer; num_points floats are read from it.
 * \param scalar     Factor applied to every input element.
 * \param num_points Number of floats to process.
 */
static inline void volk_32f_s32f_multiply_32f_generic(float* cVector,
                                                      const float* aVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const float* inputPtr = aVector;
    float* outputPtr = cVector;

    for (number = 0; number < num_points; number++) {
        /* Both pointers must advance each pass; otherwise only element 0
         * would ever be read and written. */
        *outputPtr++ = (*inputPtr++) * scalar;
    }
}
125 #ifndef INCLUDED_volk_32f_s32f_multiply_32f_a_H
126 #define INCLUDED_volk_32f_s32f_multiply_32f_a_H
132 #include <xmmintrin.h>
/*!
 * \brief Multiplies each float in aVector by scalar and stores the products
 *        in cVector, four elements at a time using SSE with aligned access.
 *
 * Uses _mm_load_ps / _mm_store_ps, which require cVector and aVector to be
 * 16-byte aligned.
 *
 * \param cVector    Output buffer (16-byte aligned); receives num_points products.
 * \param aVector    Input buffer (16-byte aligned); num_points floats are read.
 * \param scalar     Factor applied to every input element.
 * \param num_points Number of floats to process.
 */
static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    /* Broadcast the scalar once so it can be reused for every vector. */
    const __m128 bVal = _mm_set_ps1(scalar);

    for (; number < quarterPoints; number++) {
        const __m128 aVal = _mm_load_ps(aPtr);
        const __m128 cVal = _mm_mul_ps(aVal, bVal);
        _mm_store_ps(cPtr, cVal);

        /* Step to the next group of four floats; a stride of four floats
         * (16 bytes) also keeps both pointers aligned. */
        aPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the num_points % 4 elements the vector loop skipped. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * scalar;
    }
}
169 #include <immintrin.h>
/*!
 * \brief Multiplies each float in aVector by scalar and stores the products
 *        in cVector, eight elements at a time using AVX with aligned access.
 *
 * Uses _mm256_load_ps / _mm256_store_ps, which require cVector and aVector
 * to be 32-byte aligned.
 *
 * \param cVector    Output buffer (32-byte aligned); receives num_points products.
 * \param aVector    Input buffer (32-byte aligned); num_points floats are read.
 * \param scalar     Factor applied to every input element.
 * \param num_points Number of floats to process.
 */
static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    /* Broadcast the scalar once so it can be reused for every vector. */
    const __m256 bVal = _mm256_set1_ps(scalar);

    for (; number < eighthPoints; number++) {
        const __m256 aVal = _mm256_load_ps(aPtr);
        const __m256 cVal = _mm256_mul_ps(aVal, bVal);
        _mm256_store_ps(cPtr, cVal);

        /* Step to the next group of eight floats; a stride of eight floats
         * (32 bytes) also keeps both pointers aligned. */
        aPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the num_points % 8 elements the vector loop skipped. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * scalar;
    }
}
206 #include <arm_neon.h>
/*!
 * \brief Multiplies each float in aVector by scalar and stores the products
 *        in cVector, four elements at a time using ARM NEON.
 *
 * \param cVector    Output buffer; receives num_points products.
 * \param aVector    Input buffer; num_points floats are read from it.
 * \param scalar     Factor applied to every input element.
 * \param num_points Number of floats to process.
 */
static inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector,
                                                     const float* aVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const float* inputPtr = aVector;
    float* outputPtr = cVector;
    const unsigned int quarterPoints = num_points / 4;

    for (number = 0; number < quarterPoints; number++) {
        const float32x4_t aVal = vld1q_f32(inputPtr);
        const float32x4_t cVal = vmulq_n_f32(aVal, scalar);
        vst1q_f32(outputPtr, cVal);

        /* Step to the next group of four floats; without this every
         * iteration would reprocess the first four elements. */
        inputPtr += 4;
        outputPtr += 4;
    }

    /* Scalar tail for the num_points % 4 elements the vector loop skipped. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        *outputPtr++ = (*inputPtr++) * scalar;
    }
}
235 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Portable scalar kernel: multiplies each float in aVector by scalar
 *        and stores the products in cVector.
 *
 * Only plain scalar loads and stores are used, so the code itself imposes
 * no alignment requirement beyond that of float.
 *
 * \param cVector    Output buffer; receives num_points products.
 * \param aVector    Input buffer; num_points floats are read from it.
 * \param scalar     Factor applied to every input element.
 * \param num_points Number of floats to process.
 */
static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector,
                                                        const float* aVector,
                                                        const float scalar,
                                                        unsigned int num_points)
{
    unsigned int number = 0;
    const float* inputPtr = aVector;
    float* outputPtr = cVector;

    for (number = 0; number < num_points; number++) {
        /* Both pointers must advance each pass; otherwise only element 0
         * would ever be read and written. */
        *outputPtr++ = (*inputPtr++) * scalar;
    }
}
/*!
 * \brief ORC implementation entry point; declared here, defined elsewhere.
 */
extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst,
                                                  const float* src,
                                                  const float scalar,
                                                  unsigned int num_points);

/*!
 * \brief Thin wrapper that forwards its arguments unchanged to
 *        volk_32f_s32f_multiply_32f_a_orc_impl.
 *
 * NOTE(review): this "_u_" entry point delegates to the "_a_" ORC routine;
 * confirm that routine tolerates unaligned buffers.
 *
 * \param cVector    Output buffer, passed as dst.
 * \param aVector    Input buffer, passed as src.
 * \param scalar     Factor passed through to the implementation.
 * \param num_points Element count passed through to the implementation.
 */
static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
}