23 #ifndef INCLUDED_volk_16i_s32f_convert_32f_u_H
24 #define INCLUDED_volk_16i_s32f_convert_32f_u_H
30 #include <immintrin.h>
/*!
 * \brief Converts 16-bit signed integers to floats and scales them by
 *        1/scalar (AVX kernel, unaligned loads and stores).
 *
 * Eight points are processed per vector iteration: one unaligned 128-bit load
 * fetches eight int16_t values, each half is sign-extended to 32 bits,
 * converted to float and multiplied by the precomputed reciprocal of
 * \p scalar. The remaining (num_points % 8) points are handled by a scalar
 * loop that divides by \p scalar directly.
 *
 * \param outputVector Destination buffer; num_points floats are written.
 * \param inputVector  Source buffer; num_points int16_t values are read.
 * \param scalar       Divisor applied to every converted value (not checked
 *                     against zero).
 * \param num_points   Number of points to convert.
 */
static inline void volk_16i_s32f_convert_32f_u_avx(
    float* outputVector,
    const int16_t* inputVector,
    const float scalar,
    unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    const __m128 invScalar = _mm_set_ps1((float)(1.0 / scalar));
    __m128i inputVal, inputVal2;
    __m128 ret;
    __m256 output;
    const __m256 dummy = _mm256_setzero_ps();

    for (; number < eighthPoints; number++) {
        /* Load eight 16-bit values (no alignment requirement). */
        inputVal = _mm_loadu_si128((const __m128i*)inputPtr);

        /* Move the upper four values down by 8 bytes into a second register. */
        inputVal2 = _mm_srli_si128(inputVal, 8);

        /* Sign-extend the low four 16-bit values of each register to 32 bits. */
        inputVal = _mm_cvtepi16_epi32(inputVal);
        inputVal2 = _mm_cvtepi16_epi32(inputVal2);

        /* Points 0..3 go into the low 128-bit lane. */
        ret = _mm_cvtepi32_ps(inputVal);
        ret = _mm_mul_ps(ret, invScalar);
        output = _mm256_insertf128_ps(dummy, ret, 0);

        /* Points 4..7 go into the high 128-bit lane. */
        ret = _mm_cvtepi32_ps(inputVal2);
        ret = _mm_mul_ps(ret, invScalar);
        output = _mm256_insertf128_ps(output, ret, 1);

        _mm256_storeu_ps(outputVectorPtr, output);

        outputVectorPtr += 8;
        inputPtr += 8;
    }

    /* Scalar tail for the points that do not fill a whole vector. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
87 #include <smmintrin.h>
/*!
 * \brief Converts 16-bit signed integers to floats and scales them by
 *        1/scalar (SSE4.1 kernel, unaligned loads and stores).
 *
 * Eight points are processed per vector iteration as two groups of four:
 * the values are sign-extended to 32 bits, converted to float, multiplied by
 * the precomputed reciprocal of \p scalar and stored with unaligned stores.
 * The remaining (num_points % 8) points are handled by a scalar loop that
 * divides by \p scalar directly.
 *
 * \param outputVector Destination buffer; num_points floats are written.
 * \param inputVector  Source buffer; num_points int16_t values are read.
 * \param scalar       Divisor applied to every converted value (not checked
 *                     against zero).
 * \param num_points   Number of points to convert.
 */
static inline void volk_16i_s32f_convert_32f_u_sse4_1(
    float* outputVector,
    const int16_t* inputVector,
    const float scalar,
    unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    const __m128 invScalar = _mm_set_ps1((float)(1.0 / scalar));
    __m128i inputVal, inputVal2;
    __m128 ret;

    for (; number < eighthPoints; number++) {
        /* Load eight 16-bit values (no alignment requirement). */
        inputVal = _mm_loadu_si128((const __m128i*)inputPtr);

        /* Move the upper four values down by 8 bytes into a second register. */
        inputVal2 = _mm_srli_si128(inputVal, 8);

        /* Sign-extend the low four 16-bit values of each register to 32 bits. */
        inputVal = _mm_cvtepi16_epi32(inputVal);
        inputVal2 = _mm_cvtepi16_epi32(inputVal2);

        /* Points 0..3. */
        ret = _mm_cvtepi32_ps(inputVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        /* Points 4..7. */
        ret = _mm_cvtepi32_ps(inputVal2);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        inputPtr += 8;
    }

    /* Scalar tail for the points that do not fill a whole vector. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
142 #include <xmmintrin.h>
/*!
 * \brief Converts 16-bit signed integers to floats and scales them by
 *        1/scalar (SSE kernel, unaligned stores).
 *
 * Four points are processed per vector iteration: the inputs are converted to
 * float one by one, packed into a vector, multiplied by the precomputed
 * reciprocal of \p scalar and written with an unaligned store. The remaining
 * (num_points % 4) points are handled by a scalar loop that divides by
 * \p scalar directly.
 *
 * \param outputVector Destination buffer; num_points floats are written.
 * \param inputVector  Source buffer; num_points int16_t values are read.
 * \param scalar       Divisor applied to every converted value (not checked
 *                     against zero).
 * \param num_points   Number of points to convert.
 */
static inline void volk_16i_s32f_convert_32f_u_sse(
    float* outputVector,
    const int16_t* inputVector,
    const float scalar,
    unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    const __m128 invScalar = _mm_set_ps1((float)(1.0 / scalar));
    __m128 ret;

    for (; number < quarterPoints; number++) {
        /* _mm_set_ps takes its arguments from the highest lane to the lowest. */
        ret = _mm_set_ps((float)(inputPtr[3]),
                         (float)(inputPtr[2]),
                         (float)(inputPtr[1]),
                         (float)(inputPtr[0]));

        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);

        inputPtr += 4;
        outputVectorPtr += 4;
    }

    /* Scalar tail for the points that do not fill a whole vector. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) / scalar;
    }
}
178 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Converts 16-bit signed integers to floats and divides each by
 *        \p scalar (portable C implementation).
 *
 * \param outputVector Destination buffer; num_points floats are written.
 * \param inputVector  Source buffer; num_points int16_t values are read.
 * \param scalar       Divisor applied to every converted value (not checked
 *                     against zero).
 * \param num_points   Number of points to convert.
 */
static inline void volk_16i_s32f_convert_32f_generic(
    float* outputVector,
    const int16_t* inputVector,
    const float scalar,
    unsigned int num_points)
{
    float* outputVectorPtr = outputVector;
    const int16_t* inputVectorPtr = inputVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
    }
}
/*!
 * \brief Converts 16-bit signed integers to floats and scales them by
 *        1/scalar (NEON kernel).
 *
 * Eight points are processed per vector iteration. vld2_s16 de-interleaves
 * the input into even- and odd-indexed points; each half is widened to
 * 32 bits, converted to float and multiplied by the precomputed reciprocal of
 * \p scalar. vst2q_f32 then re-interleaves the two halves, so the output
 * order matches the input order. The remaining (num_points % 8) points are
 * handled by a scalar loop that divides by \p scalar directly.
 *
 * \param outputVector Destination buffer; num_points floats are written.
 * \param inputVector  Source buffer; num_points int16_t values are read.
 * \param scalar       Divisor applied to every converted value (not checked
 *                     against zero).
 * \param num_points   Number of points to convert.
 */
static inline void volk_16i_s32f_convert_32f_neon(
    float* outputVector,
    const int16_t* inputVector,
    const float scalar,
    unsigned int num_points)
{
    float* outputPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;

    int16x4x2_t input16;
    int32x4_t input32_0, input32_1;
    float32x4_t input_float_0, input_float_1;
    float32x4x2_t output_float;
    float32x4_t inv_scale;

    inv_scale = vdupq_n_f32((float)(1.0 / scalar));

    for (number = 0; number < eighth_points; number++) {
        /* val[0] receives points 0,2,4,6 and val[1] receives points 1,3,5,7. */
        input16 = vld2_s16(inputPtr);

        /* Widen 16-bit lanes to 32 bits (sign-extending). */
        input32_0 = vmovl_s16(input16.val[0]);
        input32_1 = vmovl_s16(input16.val[1]);

        input_float_0 = vcvtq_f32_s32(input32_0);
        input_float_1 = vcvtq_f32_s32(input32_1);
        output_float.val[0] = vmulq_f32(input_float_0, inv_scale);
        output_float.val[1] = vmulq_f32(input_float_1, inv_scale);

        /* Interleaving store restores the original point order. */
        vst2q_f32(outputPtr, output_float);

        /* The scalar tail below continues from these pointers. */
        inputPtr += 8;
        outputPtr += 8;
    }

    for (number = eighth_points * 8; number < num_points; number++) {
        *outputPtr++ = ((float)(*inputPtr++)) / scalar;
    }
}
248 #ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H
249 #define INCLUDED_volk_16i_s32f_convert_32f_a_H
255 #include <immintrin.h>
/*!
 * \brief Converts 16-bit signed integers to floats and scales them by
 *        1/scalar (AVX kernel, aligned loads and stores).
 *
 * Eight points are processed per vector iteration: one aligned 128-bit load
 * fetches eight int16_t values, each half is sign-extended to 32 bits,
 * converted to float and multiplied by the precomputed reciprocal of
 * \p scalar, and the result is written with an aligned 256-bit store. The
 * remaining (num_points % 8) points are handled by a scalar loop that divides
 * by \p scalar directly.
 *
 * \param outputVector Destination buffer; num_points floats are written.
 *                     Must be 32-byte aligned when num_points >= 8
 *                     (required by _mm256_store_ps).
 * \param inputVector  Source buffer; num_points int16_t values are read.
 *                     Must be 16-byte aligned when num_points >= 8
 *                     (required by _mm_load_si128).
 * \param scalar       Divisor applied to every converted value (not checked
 *                     against zero).
 * \param num_points   Number of points to convert.
 */
static inline void volk_16i_s32f_convert_32f_a_avx(
    float* outputVector,
    const int16_t* inputVector,
    const float scalar,
    unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    const __m128 invScalar = _mm_set_ps1((float)(1.0 / scalar));
    __m128i inputVal, inputVal2;
    __m128 ret;
    __m256 output;
    const __m256 dummy = _mm256_setzero_ps();

    for (; number < eighthPoints; number++) {
        /* Load eight 16-bit values from a 16-byte aligned address. */
        inputVal = _mm_load_si128((const __m128i*)inputPtr);

        /* Move the upper four values down by 8 bytes into a second register. */
        inputVal2 = _mm_srli_si128(inputVal, 8);

        /* Sign-extend the low four 16-bit values of each register to 32 bits. */
        inputVal = _mm_cvtepi16_epi32(inputVal);
        inputVal2 = _mm_cvtepi16_epi32(inputVal2);

        /* Points 0..3 go into the low 128-bit lane. */
        ret = _mm_cvtepi32_ps(inputVal);
        ret = _mm_mul_ps(ret, invScalar);
        output = _mm256_insertf128_ps(dummy, ret, 0);

        /* Points 4..7 go into the high 128-bit lane. */
        ret = _mm_cvtepi32_ps(inputVal2);
        ret = _mm_mul_ps(ret, invScalar);
        output = _mm256_insertf128_ps(output, ret, 1);

        _mm256_store_ps(outputVectorPtr, output);

        outputVectorPtr += 8;
        inputPtr += 8;
    }

    /* Scalar tail for the points that do not fill a whole vector. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
311 #ifdef LV_HAVE_SSE4_1
312 #include <smmintrin.h>
/*!
 * \brief Converts 16-bit signed integers to floats and scales them by
 *        1/scalar (SSE4.1 kernel, "aligned" variant).
 *
 * Eight points are processed per vector iteration as two groups of four:
 * the values are sign-extended to 32 bits, converted to float and multiplied
 * by the precomputed reciprocal of \p scalar. The remaining
 * (num_points % 8) points are handled by a scalar loop that divides by
 * \p scalar directly.
 *
 * Note that this variant uses the unaligned load/store intrinsics
 * (_mm_loadu_si128 / _mm_storeu_ps), so it does not itself impose an
 * alignment requirement on the buffers.
 *
 * \param outputVector Destination buffer; num_points floats are written.
 * \param inputVector  Source buffer; num_points int16_t values are read.
 * \param scalar       Divisor applied to every converted value (not checked
 *                     against zero).
 * \param num_points   Number of points to convert.
 */
static inline void volk_16i_s32f_convert_32f_a_sse4_1(
    float* outputVector,
    const int16_t* inputVector,
    const float scalar,
    unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    const __m128 invScalar = _mm_set_ps1((float)(1.0 / scalar));
    __m128i inputVal, inputVal2;
    __m128 ret;

    for (; number < eighthPoints; number++) {
        /* Load eight 16-bit values. */
        inputVal = _mm_loadu_si128((const __m128i*)inputPtr);

        /* Move the upper four values down by 8 bytes into a second register. */
        inputVal2 = _mm_srli_si128(inputVal, 8);

        /* Sign-extend the low four 16-bit values of each register to 32 bits. */
        inputVal = _mm_cvtepi16_epi32(inputVal);
        inputVal2 = _mm_cvtepi16_epi32(inputVal2);

        /* Points 0..3. */
        ret = _mm_cvtepi32_ps(inputVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        /* Points 4..7. */
        ret = _mm_cvtepi32_ps(inputVal2);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        inputPtr += 8;
    }

    /* Scalar tail for the points that do not fill a whole vector. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
366 #include <xmmintrin.h>
/*!
 * \brief Converts 16-bit signed integers to floats and scales them by
 *        1/scalar (SSE kernel, "aligned" variant).
 *
 * Four points are processed per vector iteration: the inputs are converted to
 * float one by one, packed into a vector and multiplied by the precomputed
 * reciprocal of \p scalar. The remaining (num_points % 4) points are handled
 * by a scalar loop that divides by \p scalar directly.
 *
 * Note that the result is written with _mm_storeu_ps, so this variant does
 * not itself impose an alignment requirement on the output buffer.
 *
 * \param outputVector Destination buffer; num_points floats are written.
 * \param inputVector  Source buffer; num_points int16_t values are read.
 * \param scalar       Divisor applied to every converted value (not checked
 *                     against zero).
 * \param num_points   Number of points to convert.
 */
static inline void volk_16i_s32f_convert_32f_a_sse(
    float* outputVector,
    const int16_t* inputVector,
    const float scalar,
    unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    const __m128 invScalar = _mm_set_ps1((float)(1.0 / scalar));
    __m128 ret;

    for (; number < quarterPoints; number++) {
        /* _mm_set_ps takes its arguments from the highest lane to the lowest. */
        ret = _mm_set_ps((float)(inputPtr[3]),
                         (float)(inputPtr[2]),
                         (float)(inputPtr[1]),
                         (float)(inputPtr[0]));

        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);

        inputPtr += 4;
        outputVectorPtr += 4;
    }

    /* Scalar tail for the points that do not fill a whole vector. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) / scalar;
    }
}
401 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Converts 16-bit signed integers to floats and divides each by
 *        \p scalar (portable C implementation, "aligned" entry point).
 *
 * The body performs plain element-wise accesses and places no alignment
 * requirement of its own on either buffer.
 *
 * \param outputVector Destination buffer; num_points floats are written.
 * \param inputVector  Source buffer; num_points int16_t values are read.
 * \param scalar       Divisor applied to every converted value (not checked
 *                     against zero).
 * \param num_points   Number of points to convert.
 */
static inline void volk_16i_s32f_convert_32f_a_generic(
    float* outputVector,
    const int16_t* inputVector,
    const float scalar,
    unsigned int num_points)
{
    float* outputVectorPtr = outputVector;
    const int16_t* inputVectorPtr = inputVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
    }
}
/* Cross-reference note: int16_t is defined as `signed short` (stdint.h, line 76). */