23 #ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
24 #define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
/*
 * Adds src0 element-wise to each of src1..src4 using SSE2.
 *
 * For every k in [0, num_points):
 *   target0[k] = src0[k] + src1[k]
 *   target1[k] = src0[k] + src2[k]
 *   target2[k] = src0[k] + src3[k]
 *   target3[k] = src0[k] + src4[k]
 *
 * target0..target3: output buffers, at least num_points shorts each.
 * src0..src4:       input buffers, at least num_points shorts each.
 * num_points:       number of 16-bit elements to process.
 *
 * This is the aligned ("_a") kernel: the vector path uses
 * _mm_load_si128/_mm_store_si128, so all nine buffers must be 16-byte
 * aligned whenever num_points >= 8.
 */
static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(
short* target0,
short* target1,
short* target2,
short* target3,
short* src0,
short* src1,
short* src2,
short* src3,
short* src4,
unsigned int num_points) {
  /* One 128-bit register holds eight 16-bit lanes. Dividing the element
   * count directly avoids the wrap-around of num_points * 2. */
  const unsigned int eighth_points = num_points / 8;
  unsigned int number;
  unsigned int i;

  __m128i *p_target0 = (__m128i*)target0;
  __m128i *p_target1 = (__m128i*)target1;
  __m128i *p_target2 = (__m128i*)target2;
  __m128i *p_target3 = (__m128i*)target3;

  const __m128i *p_src0 = (const __m128i*)src0;
  const __m128i *p_src1 = (const __m128i*)src1;
  const __m128i *p_src2 = (const __m128i*)src2;
  const __m128i *p_src3 = (const __m128i*)src3;
  const __m128i *p_src4 = (const __m128i*)src4;

  for (number = 0; number < eighth_points; ++number) {
    /* Load every input before storing anything, so the result stays
     * well defined if an output buffer coincides with an input buffer. */
    const __m128i xmm0 = _mm_load_si128(p_src0);
    __m128i xmm1 = _mm_load_si128(p_src1);
    __m128i xmm2 = _mm_load_si128(p_src2);
    __m128i xmm3 = _mm_load_si128(p_src3);
    __m128i xmm4 = _mm_load_si128(p_src4);

    xmm1 = _mm_add_epi16(xmm0, xmm1);
    xmm2 = _mm_add_epi16(xmm0, xmm2);
    xmm3 = _mm_add_epi16(xmm0, xmm3);
    xmm4 = _mm_add_epi16(xmm0, xmm4);

    _mm_store_si128(p_target0, xmm1);
    _mm_store_si128(p_target1, xmm2);
    _mm_store_si128(p_target2, xmm3);
    _mm_store_si128(p_target3, xmm4);

    /* Step every stream to the next group of eight elements. */
    ++p_src0;
    ++p_src1;
    ++p_src2;
    ++p_src3;
    ++p_src4;
    ++p_target0;
    ++p_target1;
    ++p_target2;
    ++p_target3;
  }

  /* Scalar tail for the num_points % 8 elements the vector loop left over. */
  for (i = eighth_points * 8; i < num_points; ++i) {
    target0[i] = (short)(src0[i] + src1[i]);
    target1[i] = (short)(src0[i] + src2[i]);
    target2[i] = (short)(src0[i] + src3[i]);
    target3[i] = (short)(src0[i] + src4[i]);
  }
}
135 #include <arm_neon.h>
/*
 * Adds src0 element-wise to each of src1..src4 using NEON.
 *
 * For every k in [0, num_points):
 *   target0[k] = src0[k] + src1[k]
 *   target1[k] = src0[k] + src2[k]
 *   target2[k] = src0[k] + src3[k]
 *   target3[k] = src0[k] + src4[k]
 *
 * target0..target3: output buffers, at least num_points shorts each.
 * src0..src4:       input buffers, at least num_points shorts each.
 * num_points:       number of 16-bit elements to process.
 */
static inline void volk_16i_x5_add_quad_16i_x4_neon(
short* target0,
short* target1,
short* target2,
short* target3,
short* src0,
short* src1,
short* src2,
short* src3,
short* src4,
unsigned int num_points) {
  /* An int16x8_t holds eight 16-bit lanes. */
  const unsigned int eighth_points = num_points / 8;
  unsigned int number = 0;

  int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec;
  int16x8_t target0_vec, target1_vec, target2_vec, target3_vec;

  for (number = 0; number < eighth_points; ++number) {
    src0_vec = vld1q_s16(src0);
    src1_vec = vld1q_s16(src1);
    src2_vec = vld1q_s16(src2);
    src3_vec = vld1q_s16(src3);
    src4_vec = vld1q_s16(src4);

    target0_vec = vaddq_s16(src0_vec, src1_vec);
    target1_vec = vaddq_s16(src0_vec, src2_vec);
    target2_vec = vaddq_s16(src0_vec, src3_vec);
    target3_vec = vaddq_s16(src0_vec, src4_vec);

    vst1q_s16(target0, target0_vec);
    vst1q_s16(target1, target1_vec);
    vst1q_s16(target2, target2_vec);
    vst1q_s16(target3, target3_vec);

    /* Step every stream past the eight elements just handled, so the next
     * iteration and the scalar tail below continue from the right place. */
    src0 += 8;
    src1 += 8;
    src2 += 8;
    src3 += 8;
    src4 += 8;
    target0 += 8;
    target1 += 8;
    target2 += 8;
    target3 += 8;
  }

  /* Scalar tail for the remaining num_points % 8 elements. The pointers
   * already sit on the first unprocessed element; src0 is advanced only
   * in the last statement because all four sums share it. */
  for (number = eighth_points * 8; number < num_points; ++number) {
    *target0++ = *src0 + *src1++;
    *target1++ = *src0 + *src2++;
    *target2++ = *src0 + *src3++;
    *target3++ = *src0++ + *src4++;
  }
}
181 #ifdef LV_HAVE_GENERIC
183 static inline void volk_16i_x5_add_quad_16i_x4_generic(
short* target0,
short* target1,
short* target2,
short* target3,
short* src0,
short* src1,
short* src2,
short* src3,
short* src4,
unsigned int num_points) {
185 const unsigned int num_bytes = num_points*2;
189 int bound = num_bytes >> 1;
191 for(i = 0; i < bound; ++i) {
192 target0[i] = src0[i] + src1[i];
193 target1[i] = src0[i] + src2[i];
194 target2[i] = src0[i] + src3[i];
195 target3[i] = src0[i] + src4[i];