23 #ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
24 #define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
/*
 * Element-wise "max*" of four 16-bit vectors, SSE2 aligned variant.
 *
 * For every index i the kernel first picks a winner from (src0[i], src1[i])
 * and from (src2[i], src3[i]), then picks a winner from those two results
 * and stores it in target[i].  Each pick is decided by the sign of the
 * wrapped 16-bit difference of the two candidates, not by a plain `>`;
 * the two agree whenever the true difference fits in a signed 16-bit value.
 *
 * target      output buffer, num_points elements
 * src0..src3  input buffers, num_points elements each
 * num_points  number of 16-bit elements to process
 *
 * All five buffers must be 16-byte aligned: the vector loop uses
 * _mm_load_si128 / _mm_store_si128, which require aligned addresses.
 *
 * NOTE(review): when the wrapped difference of a pair is exactly -32768 the
 * vector loop keeps the first operand while the scalar tail keeps the
 * second.  This mirrors the comparisons visible in the original code and is
 * left unchanged here; confirm whether the two paths are meant to agree.
 */
static inline void volk_16i_x4_quad_max_star_16i_a_sse2(
    short* target,
    short* src0,
    short* src1,
    short* src2,
    short* src3,
    unsigned int num_points)
{
    /* Derive the block count from num_points directly; going through a
     * byte count (num_points * 2) can wrap for very large inputs. */
    const unsigned int num_blocks = num_points / 8;

    const __m128i* p_src0 = (const __m128i*)src0;
    const __m128i* p_src1 = (const __m128i*)src1;
    const __m128i* p_src2 = (const __m128i*)src2;
    const __m128i* p_src3 = (const __m128i*)src3;
    __m128i* p_target = (__m128i*)target;

    const __m128i zero = _mm_setzero_si128();
    unsigned int block;
    unsigned int i;

    for (block = 0; block < num_blocks; ++block) {
        const __m128i v0 = _mm_load_si128(p_src0 + block);
        const __m128i v1 = _mm_load_si128(p_src1 + block);
        const __m128i v2 = _mm_load_si128(p_src2 + block);
        const __m128i v3 = _mm_load_si128(p_src3 + block);

        /* Lane mask is all-ones where (second - first) > 0, i.e. where the
         * second operand of the pair wins. */
        const __m128i take1 = _mm_cmpgt_epi16(_mm_sub_epi16(v1, v0), zero);
        const __m128i take3 = _mm_cmpgt_epi16(_mm_sub_epi16(v3, v2), zero);

        /* Blend: winner where the mask is set, the other operand elsewhere.
         * The two masked halves are disjoint, so OR merges them. */
        const __m128i max01 = _mm_or_si128(_mm_and_si128(take1, v1),
                                           _mm_andnot_si128(take1, v0));
        const __m128i max23 = _mm_or_si128(_mm_and_si128(take3, v3),
                                           _mm_andnot_si128(take3, v2));

        /* Second stage: same comparison between the two pair winners. */
        const __m128i take23 = _mm_cmpgt_epi16(_mm_sub_epi16(max23, max01), zero);
        const __m128i result = _mm_or_si128(_mm_and_si128(take23, max23),
                                            _mm_andnot_si128(take23, max01));

        _mm_store_si128(p_target + block, result);
    }

    /* Scalar tail for the final (num_points % 8) elements. */
    for (i = num_blocks * 8; i < num_points; ++i) {
        const short temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        const short temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
}
191 #include <arm_neon.h>
/*
 * Element-wise "max*" of four 16-bit vectors, NEON variant.
 *
 * For every index i the kernel picks a winner from (src0[i], src1[i]) and
 * from (src2[i], src3[i]), then picks a winner from those two results and
 * stores it in target[i].  Each pick is decided by the sign of the wrapped
 * 16-bit difference (first - second): the first operand is kept when the
 * difference is >= 0 in the vector loop (> 0 in the scalar tail; the two
 * only differ when the operands are equal, so the stored value is the same).
 *
 * target      output buffer, num_points elements
 * src0..src3  input buffers, num_points elements each
 * num_points  number of 16-bit elements to process
 *
 * Loads and stores use vld1q_s16 / vst1q_s16 on the caller's pointers as-is.
 */
static inline void volk_16i_x4_quad_max_star_16i_neon(
    short* target,
    short* src0,
    short* src1,
    short* src2,
    short* src3,
    unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    /* Explicit zero vector; XOR-ing an uninitialized register with itself
     * reads an indeterminate value. */
    const int16x8_t zeros = vdupq_n_s16(0);
    unsigned int i;

    for (i = 0; i < eighth_points; ++i) {
        const int16x8_t src0_vec = vld1q_s16(src0);
        const int16x8_t src1_vec = vld1q_s16(src1);
        const int16x8_t src2_vec = vld1q_s16(src2);
        const int16x8_t src3_vec = vld1q_s16(src3);

        /* Mask lanes where (first - second) >= 0, then bit-select the
         * first operand in those lanes and the second elsewhere. */
        const uint16x8_t keep0 = vcgeq_s16(vsubq_s16(src0_vec, src1_vec), zeros);
        const uint16x8_t keep2 = vcgeq_s16(vsubq_s16(src2_vec, src3_vec), zeros);
        const int16x8_t result1_vec = vbslq_s16(keep0, src0_vec, src1_vec);
        const int16x8_t result2_vec = vbslq_s16(keep2, src2_vec, src3_vec);

        /* Second stage: same selection between the two pair winners. */
        const uint16x8_t keep1 =
            vcgeq_s16(vsubq_s16(result1_vec, result2_vec), zeros);
        vst1q_s16(target, vbslq_s16(keep1, result1_vec, result2_vec));

        src0 += 8;
        src1 += 8;
        src2 += 8;
        src3 += 8;
        target += 8;
    }

    /* Scalar tail for the final (num_points % 8) elements; every pointer
     * must advance together with target. */
    for (i = eighth_points * 8; i < num_points; ++i) {
        const short temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1;
        const short temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3;
        *target++ = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
        src0++;
        src1++;
        src2++;
        src3++;
    }
}
251 #ifdef LV_HAVE_GENERIC
252 static inline void volk_16i_x4_quad_max_star_16i_generic(
short* target,
short* src0,
short* src1,
short* src2,
short* src3,
unsigned int num_points) {
254 const unsigned int num_bytes = num_points*2;
258 int bound = num_bytes >> 1;
262 for(i = 0; i < bound; ++i) {
263 temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
264 temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
265 target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;