23 #ifndef INCLUDED_volk_32u_byteswap_u_H
24 #define INCLUDED_volk_32u_byteswap_u_H
30 #include <emmintrin.h>
/*
 * volk_32u_byteswap_u_sse2 - reverse the byte order of 32-bit words with SSE2,
 * using unaligned 128-bit loads/stores.
 *
 * intsToSwap: buffer of 32-bit words to process.
 * num_points: number of 32-bit words in the buffer.
 *
 * Each vector result is stored to the same address it was loaded from, and the
 * scalar tail writes back through the same pointer it works on, so the visible
 * code operates in place.
 *
 * NOTE(review): this listing has dropped lines (the embedded numbers jump, e.g.
 * 38 -> 41, 61 -> 66, 67 -> 69). The declaration and advancing of inputPtr, the
 * declaration of outputVal, and the closing braces are not visible here;
 * presumably inputPtr starts at intsToSwap - confirm against the full file.
 */
37 static inline void volk_32u_byteswap_u_sse2(
uint32_t* intsToSwap,
unsigned int num_points){
38 unsigned int number = 0;
41 __m128i input, byte1, byte2, byte3, byte4, output;
// Per-lane masks: byte2mask keeps bits 16..23, byte3mask keeps bits 8..15.
42 __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
43 __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
// Number of complete groups of four 32-bit words (one 128-bit register each).
45 const uint64_t quarterPoints = num_points / 4;
46 for(;number < quarterPoints; number++){
// Unaligned load of four 32-bit words.
48 input = _mm_loadu_si128((__m128i*)inputPtr);
// Four per-lane shifts. The 24-bit shifts leave exactly one byte in its final
// position; the 8-bit shifts carry extra bytes that are masked off below.
50 byte1 = _mm_slli_epi32(input, 24);
51 byte2 = _mm_slli_epi32(input, 8);
52 byte3 = _mm_srli_epi32(input, 8);
53 byte4 = _mm_srli_epi32(input, 24);
// Combine the four repositioned bytes into the swapped word.
55 output = _mm_or_si128(byte1, byte4);
56 byte2 = _mm_and_si128(byte2, byte2mask);
57 output = _mm_or_si128(output, byte2);
58 byte3 = _mm_and_si128(byte3, byte3mask);
59 output = _mm_or_si128(output, byte3);
// Unaligned store back to the address the data was loaded from.
61 _mm_storeu_si128((__m128i*)inputPtr, output);
// Scalar tail: handle the num_points % 4 words the vector loop did not cover.
66 number = quarterPoints*4;
67 for(; number < num_points; number++){
// Scalar swap: bits 24..31 -> 0..7, 16..23 -> 8..15, 8..15 -> 16..23, 0..7 -> 24..31.
69 outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
70 *inputPtr = outputVal;
/*
 * volk_32u_byteswap_neon - reverse the byte order of 32-bit words with ARM NEON,
 * eight words (32 bytes) per loop iteration via a 4-register table lookup.
 *
 * intsToSwap: buffer of 32-bit words to process.
 * num_points: number of 32-bit words in the buffer.
 *
 * NOTE(review): this listing has dropped lines (the embedded numbers jump, e.g.
 * 90 -> 100, 114 -> 119, 119 -> 121). The declaration and advancing of
 * inputPtr, the declaration/load/store of output in the tail loop, and the
 * closing braces are not visible here; presumably inputPtr is a uint32_t*
 * starting at intsToSwap - confirm against the full file.
 */
83 static inline void volk_32u_byteswap_neon(
uint32_t* intsToSwap,
unsigned int num_points){
85 unsigned int number = 0;
// Number of complete groups of eight 32-bit words.
86 unsigned int n8points = num_points / 8;
88 uint8x8x4_t input_table;
89 uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
90 uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
// Precomputed byte-index vectors for vtbl4_u8, written as decimal 64-bit
// constants. Decoded (lowest byte first):
//   int_lookup01 = 0x0109111900081018 -> {24,16, 8,0, 25,17, 9,1}
//   int_lookup23 = 0x030B131B020A121A -> {26,18,10,2, 27,19,11,3}
//   int_lookup45 = 0x050D151D040C141C -> {28,20,12,4, 29,21,13,5}
//   int_lookup67 = 0x070F171F060E161E -> {30,22,14,6, 31,23,15,7}
// Each constant is the previous one plus 0x0202020202020202, i.e. every index
// advanced by two words.
100 int_lookup01 = vcreate_u8(74609667900706840);
101 int_lookup23 = vcreate_u8(219290013576860186);
102 int_lookup45 = vcreate_u8(363970359253013532);
103 int_lookup67 = vcreate_u8(508650704929166878);
105 for(number = 0; number < n8points; ++number){
// vld4_u8 reads 32 bytes and de-interleaves them with stride 4, so table
// register k holds byte k of each of the eight words. Table index 8*k + w is
// therefore byte k of word w, which is what the lookup constants rely on.
106 input_table = vld4_u8((
uint8_t*) inputPtr);
// Each lookup gathers bytes 3,2,1,0 of two consecutive words.
107 swapped_int01 = vtbl4_u8(input_table, int_lookup01);
108 swapped_int23 = vtbl4_u8(input_table, int_lookup23);
109 swapped_int45 = vtbl4_u8(input_table, int_lookup45);
110 swapped_int67 = vtbl4_u8(input_table, int_lookup67);
// Each vst1_u8 writes 8 bytes; the +2/+4/+6 offsets are applied before the
// cast, so they step in units of inputPtr's element type (presumably 32-bit
// words, i.e. two words per store - confirm).
111 vst1_u8((
uint8_t*) inputPtr, swapped_int01);
112 vst1_u8((
uint8_t*) (inputPtr+2), swapped_int23);
113 vst1_u8((
uint8_t*) (inputPtr+4), swapped_int45);
114 vst1_u8((
uint8_t*) (inputPtr+6), swapped_int67);
// Scalar tail: handle the num_points % 8 words the vector loop did not cover.
119 for(number = n8points * 8; number < num_points; ++number){
// Scalar swap: bits 24..31 -> 0..7, 16..23 -> 8..15, 8..15 -> 16..23, 0..7 -> 24..31.
121 output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
129 #ifdef LV_HAVE_GENERIC
/*
 * volk_32u_byteswap_generic - portable scalar byte-order reversal of 32-bit
 * words, one word per loop iteration.
 *
 * intsToSwap: buffer of 32-bit words to process.
 * num_points: number of 32-bit words; the loop runs exactly this many times.
 *
 * NOTE(review): this listing has dropped lines (the embedded numbers jump
 * 135 -> 139 -> 141). The declarations of point and output, the load/store of
 * output, and the closing braces are not visible here; presumably output is
 * read from and written back to intsToSwap - confirm against the full file.
 */
135 static inline void volk_32u_byteswap_generic(
uint32_t* intsToSwap,
unsigned int num_points){
139 for(point = 0; point < num_points; point++){
// Scalar swap: bits 24..31 -> 0..7, 16..23 -> 8..15, 8..15 -> 16..23, 0..7 -> 24..31.
141 output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
151 #ifndef INCLUDED_volk_32u_byteswap_a_H
152 #define INCLUDED_volk_32u_byteswap_a_H
158 #include <emmintrin.h>
/*
 * volk_32u_byteswap_a_sse2 - reverse the byte order of 32-bit words with SSE2,
 * using aligned 128-bit loads/stores.
 *
 * intsToSwap: buffer of 32-bit words to process. _mm_load_si128 and
 *             _mm_store_si128 require the accessed address to be 16-byte
 *             aligned.
 * num_points: number of 32-bit words in the buffer.
 *
 * Each vector result is stored to the same address it was loaded from, and the
 * scalar tail writes back through the same pointer it works on, so the visible
 * code operates in place.
 *
 * NOTE(review): this listing has dropped lines (the embedded numbers jump, e.g.
 * 166 -> 169, 189 -> 194, 195 -> 197). The declaration and advancing of
 * inputPtr, the declaration of outputVal, and the closing braces are not
 * visible here; presumably inputPtr starts at intsToSwap - confirm against the
 * full file.
 */
165 static inline void volk_32u_byteswap_a_sse2(
uint32_t* intsToSwap,
unsigned int num_points){
166 unsigned int number = 0;
169 __m128i input, byte1, byte2, byte3, byte4, output;
// Per-lane masks: byte2mask keeps bits 16..23, byte3mask keeps bits 8..15.
170 __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
171 __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
// Number of complete groups of four 32-bit words (one 128-bit register each).
173 const uint64_t quarterPoints = num_points / 4;
174 for(;number < quarterPoints; number++){
// Aligned load of four 32-bit words.
176 input = _mm_load_si128((__m128i*)inputPtr);
// Four per-lane shifts. The 24-bit shifts leave exactly one byte in its final
// position; the 8-bit shifts carry extra bytes that are masked off below.
178 byte1 = _mm_slli_epi32(input, 24);
179 byte2 = _mm_slli_epi32(input, 8);
180 byte3 = _mm_srli_epi32(input, 8);
181 byte4 = _mm_srli_epi32(input, 24);
// Combine the four repositioned bytes into the swapped word.
183 output = _mm_or_si128(byte1, byte4);
184 byte2 = _mm_and_si128(byte2, byte2mask);
185 output = _mm_or_si128(output, byte2);
186 byte3 = _mm_and_si128(byte3, byte3mask);
187 output = _mm_or_si128(output, byte3);
// Aligned store back to the address the data was loaded from.
189 _mm_store_si128((__m128i*)inputPtr, output);
// Scalar tail: handle the num_points % 4 words the vector loop did not cover.
194 number = quarterPoints*4;
195 for(; number < num_points; number++){
// Scalar swap: bits 24..31 -> 0..7, 16..23 -> 8..15, 8..15 -> 16..23, 0..7 -> 24..31.
197 outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
198 *inputPtr = outputVal;
204 #ifdef LV_HAVE_GENERIC
/*
 * volk_32u_byteswap_a_generic - portable scalar byte-order reversal of 32-bit
 * words, one word per loop iteration. The visible lines match those of
 * volk_32u_byteswap_generic.
 *
 * intsToSwap: buffer of 32-bit words to process.
 * num_points: number of 32-bit words; the loop runs exactly this many times.
 *
 * NOTE(review): this listing has dropped lines (the embedded numbers jump
 * 210 -> 214 -> 216). The declarations of point and output, the load/store of
 * output, and the closing braces are not visible here; presumably output is
 * read from and written back to intsToSwap - confirm against the full file.
 */
210 static inline void volk_32u_byteswap_a_generic(
uint32_t* intsToSwap,
unsigned int num_points){
214 for(point = 0; point < num_points; point++){
// Scalar swap: bits 24..31 -> 0..7, 16..23 -> 8..15, 8..15 -> 16..23, 0..7 -> 24..31.
216 output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
unsigned char uint8_t
Definition: stdint.h:78
unsigned int uint32_t
Definition: stdint.h:80
unsigned __int64 uint64_t
Definition: stdint.h:90