23 #ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H
24 #define INCLUDED_volk_16i_max_star_horizontal_16i_a_H
/*
 * Pairwise "max star" over adjacent 16-bit samples, SSSE3 aligned variant.
 *
 * For every complete pair (src0[2k], src0[2k+1]) one value is written to
 * target[k]: src0[2k] when the 16-bit wrapped difference
 * src0[2k] - src0[2k+1] is positive, otherwise src0[2k+1].
 *
 * target     - output buffer, receives num_points / 2 values. Written with
 *              _mm_store_si128, so it must be 16-byte aligned.
 * src0       - input buffer of num_points values. Read with _mm_load_si128,
 *              so it must be 16-byte aligned.
 * num_points - number of INPUT samples. A trailing unpaired sample (odd
 *              count) is ignored.
 */
static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target,
                                                            int16_t* src0,
                                                            unsigned int num_points)
{
    /* Byte-shuffle controls that gather the FIRST element of each pair into
     * the low (0) or high (1) 64 bits; 0xff lanes are zeroed by pshufb. */
    static const uint8_t shufmask0[16] = { 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d,
                                           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
    static const uint8_t shufmask1[16] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
                                           0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d };
    /* Adding 2 to a shuffle index moves it from the first to the second
     * element of the pair. */
    static const uint8_t andmask0[16] = { 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
                                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
    static const uint8_t andmask1[16] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                                          0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 };

    /* uint8_t arrays carry no 16-byte alignment guarantee, so use unaligned
     * loads for the constant tables. */
    const __m128i shuf_lo = _mm_loadu_si128((const __m128i*)shufmask0);
    const __m128i shuf_hi = _mm_loadu_si128((const __m128i*)shufmask1);
    const __m128i step_lo = _mm_loadu_si128((const __m128i*)andmask0);
    const __m128i step_hi = _mm_loadu_si128((const __m128i*)andmask1);
    const __m128i zero = _mm_setzero_si128();

    __m128i* p_target = (__m128i*)target;
    const __m128i* p_src0 = (const __m128i*)src0;

    /* Work split, expressed in input samples rather than bytes so that no
     * intermediate multiplication can wrap. */
    const unsigned int bound = num_points >> 4;              /* 16-sample steps */
    const unsigned int intermediate = (num_points >> 3) & 1; /* one 8-sample step */
    unsigned int i;

    for (i = 0; i < bound; ++i) {
        __m128i in0 = _mm_load_si128(p_src0);
        __m128i in1 = _mm_load_si128(p_src0 + 1);
        p_src0 += 2;

        /* Lanes 0-3: pair differences of in0; lanes 4-7: those of in1. */
        const __m128i diff = _mm_hsub_epi16(in0, in1);
        /* All-ones where first - second < 0, i.e. the second element wins. */
        const __m128i second_wins = _mm_cmpgt_epi16(zero, diff);

        const __m128i ctl0 = _mm_add_epi8(_mm_and_si128(second_wins, step_lo), shuf_lo);
        const __m128i ctl1 = _mm_add_epi8(_mm_and_si128(second_wins, step_hi), shuf_hi);

        in0 = _mm_shuffle_epi8(in0, ctl0); /* winners in low half, high half zero */
        in1 = _mm_shuffle_epi8(in1, ctl1); /* winners in high half, low half zero */

        /* The halves are disjoint, so the add simply merges them. */
        _mm_store_si128(p_target, _mm_add_epi16(in0, in1));
        p_target += 1;
    }

    if (intermediate) {
        __m128i in0 = _mm_load_si128(p_src0);
        p_src0 += 1;

        /* Only lanes 0-3 of the difference are consumed below, so the second
         * hsub operand is irrelevant; pass zero instead of a stale register. */
        const __m128i diff = _mm_hsub_epi16(in0, zero);
        const __m128i second_wins = _mm_cmpgt_epi16(zero, diff);
        const __m128i ctl0 = _mm_add_epi8(_mm_and_si128(second_wins, step_lo), shuf_lo);

        in0 = _mm_shuffle_epi8(in0, ctl0);

        /* Store the four results held in the low 64 bits. */
        _mm_storel_epi64(p_target, in0);
        p_target = (__m128i*)((int8_t*)p_target + 8);
    }

    /* Scalar tail: remaining complete pairs only, so src0[i + 1] is always
     * inside the input. */
    for (i = (bound << 4) + (intermediate << 3); i + 1 < num_points; i += 2) {
        target[i >> 1] =
            ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
    }
}
136 #include <arm_neon.h>
/*
 * Pairwise "max star" over adjacent 16-bit samples, NEON variant.
 *
 * For every complete pair (src0[2k], src0[2k+1]) one value is written to
 * target[k]: the first element when the 16-bit wrapped difference
 * first - second is non-negative (vector path) / positive (scalar tail),
 * otherwise the second. The two rules differ only when the difference is
 * zero, in which case both elements are equal.
 *
 * target     - output buffer, receives num_points / 2 values.
 * src0       - input buffer of num_points values.
 * num_points - number of INPUT samples. A trailing unpaired sample (odd
 *              count) is ignored.
 */
static inline void volk_16i_max_star_horizontal_16i_neon(int16_t* target,
                                                         int16_t* src0,
                                                         unsigned int num_points)
{
    const unsigned int sixteenth_points = num_points / 16;
    const unsigned int tail_points = num_points % 16;
    const int16x8_t zeros = vdupq_n_s16(0);
    unsigned int number;

    for (number = 0; number < sixteenth_points; ++number) {
        /* De-interleaving load: val[0] holds the first element of each of
         * the 8 pairs, val[1] the second. */
        const int16x8x2_t input_vec = vld2q_s16(src0);
        const int16x8_t diff = vsubq_s16(input_vec.val[0], input_vec.val[1]);
        const uint16x8_t first_wins = vcgeq_s16(diff, zeros);

        /* Per-lane select: first element where the mask is set, else second. */
        vst1q_s16(target, vbslq_s16(first_wins, input_vec.val[0], input_vec.val[1]));

        src0 += 16;
        target += 8;
    }

    /* Scalar tail on the already-advanced pointers: complete pairs only, so
     * src0[number + 1] never reads past the input. */
    for (number = 0; number + 1 < tail_points; number += 2) {
        target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0)
                                  ? src0[number]
                                  : src0[number + 1];
    }
}
/*
 * Externally defined variant of the kernel; only the prototype lives here.
 * Takes the same arguments as the other variants in this file: output
 * buffer, input buffer, and number of input samples.
 */
extern void volk_16i_max_star_horizontal_16i_neonasm(int16_t* target,
                                                     int16_t* src0,
                                                     unsigned int num_points);
170 #ifdef LV_HAVE_GENERIC
/*
 * Pairwise "max star" over adjacent 16-bit samples, portable C variant.
 *
 * For every complete pair (src0[2k], src0[2k+1]) writes to target[k] the
 * first element when the difference first - second, converted to int16_t,
 * is positive, and the second element otherwise.
 *
 * target     - output buffer, receives num_points / 2 values.
 * src0       - input buffer of num_points values.
 * num_points - number of INPUT samples. A trailing unpaired sample (odd
 *              count) is ignored; nothing is read or written for it.
 */
static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target,
                                                            int16_t* src0,
                                                            unsigned int num_points)
{
    unsigned int i;

    /* Iterate over complete pairs only: the `i + 1 < num_points` bound keeps
     * src0[i + 1] inside the input even when num_points is odd. */
    for (i = 0; i + 1 < num_points; i += 2) {
        target[i >> 1] =
            ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
    }
}
/*
 * Referenced definitions:
 *   bit128_p(x) - macro, volk_common.h:94
 *   uint8_t     - unsigned char, stdint.h:78
 *   int16_t     - signed short, stdint.h:76
 *   int8_t      - signed char, stdint.h:75
 */