23 #ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H
24 #define INCLUDED_volk_32f_s32f_convert_8i_u_H
30 #include <emmintrin.h>
/*!
  \brief Multiplies each point in the input buffer by the scalar value, then converts the result into an 8 bit integer value (SSE2, unaligned buffers)
  \param outputVector The 8 bit output data buffer; must hold at least num_points elements
  \param inputVector The floating point input data buffer; must hold at least num_points elements
  \param scalar The value multiplied against each point in the input buffer
  \param num_points The number of data values to be converted

  Scaled values are clamped to [-128, 127] before conversion.
  \note The vector body converts with _mm_cvtps_epi32, which rounds according to
  the current MXCSR rounding mode, whereas the scalar tail truncates toward zero.
*/
static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
  unsigned int number = 0;

  const unsigned int sixteenthPoints = num_points / 16;

  const float* inputVectorPtr = inputVector;
  int8_t* outputVectorPtr = outputVector;

  /* Representable range of int8_t, kept as floats so clamping happens before conversion. */
  const float min_val = -128;
  const float max_val = 127;
  float r;

  __m128 vScalar = _mm_set_ps1(scalar);
  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
  __m128 vmin_val = _mm_set_ps1(min_val);
  __m128 vmax_val = _mm_set_ps1(max_val);

  for(; number < sixteenthPoints; number++){
    /* Unaligned loads: 4 registers x 4 floats = 16 points per iteration. */
    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
    inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
    inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;

    /* Scale, then clamp to [min_val, max_val]. */
    inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
    inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
    inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
    inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);

    intInputVal1 = _mm_cvtps_epi32(inputVal1);
    intInputVal2 = _mm_cvtps_epi32(inputVal2);
    intInputVal3 = _mm_cvtps_epi32(inputVal3);
    intInputVal4 = _mm_cvtps_epi32(inputVal4);

    /* Narrow 32 -> 16 -> 8 bits with signed saturation; element order is preserved. */
    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);

    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);

    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
    outputVectorPtr += 16;
  }

  /* Scalar tail for the remaining (num_points % 16) points. */
  number = sixteenthPoints * 16;
  for(; number < num_points; number++){
    r = inputVector[number] * scalar;
    /* Clamp first: converting an out-of-range float to an integer type is undefined. */
    if(r > max_val){
      r = max_val;
    }
    else if(r < min_val){
      r = min_val;
    }
    outputVector[number] = (int8_t)(r);
  }
}
95 #include <xmmintrin.h>
/*!
  \brief Multiplies each point in the input buffer by the scalar value, then converts the result into an 8 bit integer value (SSE, unaligned buffers)
  \param outputVector The 8 bit output data buffer; must hold at least num_points elements
  \param inputVector The floating point input data buffer; must hold at least num_points elements
  \param scalar The value multiplied against each point in the input buffer
  \param num_points The number of data values to be converted

  Scaled values are clamped to [-128, 127]; the float to int8_t conversion is a
  C cast and therefore truncates toward zero.
*/
static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
  unsigned int number = 0;

  const unsigned int quarterPoints = num_points / 4;

  const float* inputVectorPtr = inputVector;
  int8_t* outputVectorPtr = outputVector;

  /* Representable range of int8_t, kept as floats so clamping happens before conversion. */
  const float min_val = -128;
  const float max_val = 127;
  float r;

  __m128 vScalar = _mm_set_ps1(scalar);
  __m128 ret;
  __m128 vmin_val = _mm_set_ps1(min_val);
  __m128 vmax_val = _mm_set_ps1(max_val);

  /* Plain local scratch buffer: written with an unaligned store because no
     alignment is requested for it. */
  float outputFloatBuffer[4];

  for(; number < quarterPoints; number++){
    ret = _mm_loadu_ps(inputVectorPtr);
    inputVectorPtr += 4; /* advance to the next group of four input points */

    /* Scale, then clamp to [min_val, max_val]. */
    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);

    /* SSE has no float -> integer pack; spill to memory and narrow each lane in C. */
    _mm_storeu_ps(outputFloatBuffer, ret);
    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
  }

  /* Scalar tail for the remaining (num_points % 4) points. */
  number = quarterPoints * 4;
  for(; number < num_points; number++){
    r = inputVector[number] * scalar;
    /* Clamp first: converting an out-of-range float to an integer type is undefined. */
    if(r > max_val){
      r = max_val;
    }
    else if(r < min_val){
      r = min_val;
    }
    outputVector[number] = (int8_t)(r);
  }
}
148 #ifdef LV_HAVE_GENERIC
/*!
  \brief Multiplies each point in the input buffer by the scalar value, then converts the result into an 8 bit integer value (portable C implementation)
  \param outputVector The 8 bit output data buffer; must hold at least num_points elements
  \param inputVector The floating point input data buffer; must hold at least num_points elements
  \param scalar The value multiplied against each point in the input buffer
  \param num_points The number of data values to be converted

  Scaled values are clamped to [-128, 127]; the float to int8_t conversion is a
  C cast and therefore truncates toward zero.
*/
static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
  int8_t* outputVectorPtr = outputVector;
  const float* inputVectorPtr = inputVector;
  unsigned int number = 0;
  /* Representable range of int8_t, kept as floats so clamping happens before conversion. */
  const float min_val = -128;
  const float max_val = 127;
  float r;

  for(number = 0; number < num_points; number++){
    r = *inputVectorPtr++ * scalar;
    /* Clamp first: converting an out-of-range float to an integer type is undefined. */
    if(r > max_val){
      r = max_val;
    }
    else if(r < min_val){
      r = min_val;
    }
    *outputVectorPtr++ = (int8_t)(r);
  }
}
180 #ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
181 #define INCLUDED_volk_32f_s32f_convert_8i_a_H
188 #include <emmintrin.h>
/*!
  \brief Multiplies each point in the input buffer by the scalar value, then converts the result into an 8 bit integer value (SSE2, aligned buffers)
  \param outputVector The 8 bit output data buffer; must hold at least num_points elements and be 16-byte aligned (written with _mm_store_si128)
  \param inputVector The floating point input data buffer; must hold at least num_points elements and be 16-byte aligned (read with _mm_load_ps)
  \param scalar The value multiplied against each point in the input buffer
  \param num_points The number of data values to be converted

  Scaled values are clamped to [-128, 127] before conversion.
  \note The vector body converts with _mm_cvtps_epi32, which rounds according to
  the current MXCSR rounding mode, whereas the scalar tail truncates toward zero.
*/
static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
  unsigned int number = 0;

  const unsigned int sixteenthPoints = num_points / 16;

  const float* inputVectorPtr = inputVector;
  int8_t* outputVectorPtr = outputVector;

  /* Representable range of int8_t, kept as floats so clamping happens before conversion. */
  const float min_val = -128;
  const float max_val = 127;
  float r;

  __m128 vScalar = _mm_set_ps1(scalar);
  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
  __m128 vmin_val = _mm_set_ps1(min_val);
  __m128 vmax_val = _mm_set_ps1(max_val);

  for(; number < sixteenthPoints; number++){
    /* Aligned loads: 4 registers x 4 floats = 16 points per iteration. */
    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
    inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
    inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;

    /* Scale, then clamp to [min_val, max_val]. */
    inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
    inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
    inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
    inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);

    intInputVal1 = _mm_cvtps_epi32(inputVal1);
    intInputVal2 = _mm_cvtps_epi32(inputVal2);
    intInputVal3 = _mm_cvtps_epi32(inputVal3);
    intInputVal4 = _mm_cvtps_epi32(inputVal4);

    /* Narrow 32 -> 16 -> 8 bits with signed saturation; element order is preserved. */
    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);

    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);

    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
    outputVectorPtr += 16;
  }

  /* Scalar tail for the remaining (num_points % 16) points. */
  number = sixteenthPoints * 16;
  for(; number < num_points; number++){
    r = inputVector[number] * scalar;
    /* Clamp first: converting an out-of-range float to an integer type is undefined. */
    if(r > max_val){
      r = max_val;
    }
    else if(r < min_val){
      r = min_val;
    }
    outputVector[number] = (int8_t)(r);
  }
}
252 #include <xmmintrin.h>
/*!
  \brief Multiplies each point in the input buffer by the scalar value, then converts the result into an 8 bit integer value (SSE, aligned input)
  \param outputVector The 8 bit output data buffer; must hold at least num_points elements
  \param inputVector The floating point input data buffer; must hold at least num_points elements and be 16-byte aligned (read with _mm_load_ps)
  \param scalar The value multiplied against each point in the input buffer
  \param num_points The number of data values to be converted

  Scaled values are clamped to [-128, 127]; the float to int8_t conversion is a
  C cast and therefore truncates toward zero.
*/
static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
  unsigned int number = 0;

  const unsigned int quarterPoints = num_points / 4;

  const float* inputVectorPtr = inputVector;

  /* Representable range of int8_t, kept as floats so clamping happens before conversion. */
  const float min_val = -128;
  const float max_val = 127;
  float r;

  int8_t* outputVectorPtr = outputVector;
  __m128 vScalar = _mm_set_ps1(scalar);
  __m128 ret;
  __m128 vmin_val = _mm_set_ps1(min_val);
  __m128 vmax_val = _mm_set_ps1(max_val);

  /* Plain local scratch buffer: written with an unaligned store because no
     alignment is requested for it. */
  float outputFloatBuffer[4];

  for(; number < quarterPoints; number++){
    ret = _mm_load_ps(inputVectorPtr);
    inputVectorPtr += 4; /* advance to the next group of four input points */

    /* Scale, then clamp to [min_val, max_val]. */
    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);

    /* SSE has no float -> integer pack; spill to memory and narrow each lane in C. */
    _mm_storeu_ps(outputFloatBuffer, ret);
    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
  }

  /* Scalar tail for the remaining (num_points % 4) points. */
  number = quarterPoints * 4;
  for(; number < num_points; number++){
    r = inputVector[number] * scalar;
    /* Clamp first: converting an out-of-range float to an integer type is undefined. */
    if(r > max_val){
      r = max_val;
    }
    else if(r < min_val){
      r = min_val;
    }
    outputVector[number] = (int8_t)(r);
  }
}
304 #ifdef LV_HAVE_GENERIC
/*!
  \brief Multiplies each point in the input buffer by the scalar value, then converts the result into an 8 bit integer value (portable C implementation)
  \param outputVector The 8 bit output data buffer; must hold at least num_points elements
  \param inputVector The floating point input data buffer; must hold at least num_points elements
  \param scalar The value multiplied against each point in the input buffer
  \param num_points The number of data values to be converted

  Scaled values are clamped to [-128, 127]; the float to int8_t conversion is a
  C cast and therefore truncates toward zero.
*/
static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
  int8_t* outputVectorPtr = outputVector;
  const float* inputVectorPtr = inputVector;
  unsigned int number = 0;
  /* Representable range of int8_t, kept as floats so clamping happens before conversion. */
  const float min_val = -128;
  const float max_val = 127;
  float r;

  for(number = 0; number < num_points; number++){
    r = *inputVectorPtr++ * scalar;
    /* Clamp first: converting an out-of-range float to an integer type is undefined. */
    if(r > max_val){
      r = max_val;
    }
    else if(r < min_val){
      r = min_val;
    }
    *outputVectorPtr++ = (int8_t)(r);
  }
}
signed short int16_t
Definition: stdint.h:76
signed char int8_t
Definition: stdint.h:75
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27