GNU Radio Manual and C++ API Reference  3.7.6.1
The Free & Open Software Radio Ecosystem
volk_16i_x5_add_quad_16i_x4.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 #ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
24 #define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
25 
26 
27 #include<inttypes.h>
28 #include<stdio.h>
29 
30 
31 #ifdef LV_HAVE_SSE2
32 #include<xmmintrin.h>
33 #include<emmintrin.h>
34 
/*
 * SSE2 kernel: adds src0 element-wise to each of src1..src4.
 *
 *   target0[i] = src0[i] + src1[i]
 *   target1[i] = src0[i] + src2[i]
 *   target2[i] = src0[i] + src3[i]
 *   target3[i] = src0[i] + src4[i]      for 0 <= i < num_points
 *
 * Blocks of eight 16-bit values are handled with aligned 128-bit loads and
 * stores (_mm_load_si128 / _mm_store_si128), so every pointer must be
 * 16-byte aligned. Any remaining (num_points % 8) values are handled by a
 * scalar loop.
 *
 * target0..target3  output buffers, each holding at least num_points shorts
 * src0..src4        input buffers, each holding at least num_points shorts
 * num_points        number of 16-bit elements to process per buffer
 */
static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points) {

  /* One __m128i holds eight 16-bit lanes. The counts are derived directly
     from num_points (not from a byte count) so nothing wraps around or gets
     narrowed to a signed int for very large inputs. */
  const unsigned int eighth_points = num_points / 8;
  unsigned int number;

  __m128i base, sum0, sum1, sum2, sum3;

  __m128i *p_target0 = (__m128i*)target0;
  __m128i *p_target1 = (__m128i*)target1;
  __m128i *p_target2 = (__m128i*)target2;
  __m128i *p_target3 = (__m128i*)target3;

  const __m128i *p_src0 = (const __m128i*)src0;
  const __m128i *p_src1 = (const __m128i*)src1;
  const __m128i *p_src2 = (const __m128i*)src2;
  const __m128i *p_src3 = (const __m128i*)src3;
  const __m128i *p_src4 = (const __m128i*)src4;

  for(number = 0; number < eighth_points; ++number) {
    /* All five loads are issued before any store. */
    base = _mm_load_si128(p_src0);
    sum0 = _mm_load_si128(p_src1);
    sum1 = _mm_load_si128(p_src2);
    sum2 = _mm_load_si128(p_src3);
    sum3 = _mm_load_si128(p_src4);

    sum0 = _mm_add_epi16(base, sum0);
    sum1 = _mm_add_epi16(base, sum1);
    sum2 = _mm_add_epi16(base, sum2);
    sum3 = _mm_add_epi16(base, sum3);

    _mm_store_si128(p_target0, sum0);
    _mm_store_si128(p_target1, sum1);
    _mm_store_si128(p_target2, sum2);
    _mm_store_si128(p_target3, sum3);

    p_src0 += 1;
    p_src1 += 1;
    p_src2 += 1;
    p_src3 += 1;
    p_src4 += 1;

    p_target0 += 1;
    p_target1 += 1;
    p_target2 += 1;
    p_target3 += 1;
  }

  /* Scalar tail for the elements that do not fill a whole vector. */
  for(number = eighth_points * 8; number < num_points; ++number) {
    target0[number] = (short)(src0[number] + src1[number]);
    target1[number] = (short)(src0[number] + src2[number]);
    target2[number] = (short)(src0[number] + src3[number]);
    target3[number] = (short)(src0[number] + src4[number]);
  }
}
132 #endif /*LV_HAVE_SSE2*/
133 
134 #ifdef LV_HAVE_NEON
135 #include <arm_neon.h>
/*
 * NEON kernel: adds src0 element-wise to each of src1..src4.
 *
 *   target0[i] = src0[i] + src1[i]
 *   target1[i] = src0[i] + src2[i]
 *   target2[i] = src0[i] + src3[i]
 *   target3[i] = src0[i] + src4[i]      for 0 <= i < num_points
 *
 * Eight 16-bit values are processed per iteration with vld1q_s16 /
 * vaddq_s16 / vst1q_s16; the remaining (num_points % 8) values are handled
 * by a scalar loop that continues from the advanced pointers.
 *
 * target0..target3  output buffers, each holding at least num_points shorts
 * src0..src4        input buffers, each holding at least num_points shorts
 * num_points        number of 16-bit elements to process per buffer
 */
static inline void volk_16i_x5_add_quad_16i_x4_neon(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points) {

  const unsigned int eighth_points = num_points / 8;
  unsigned int number = 0;

  int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec;
  int16x8_t target0_vec, target1_vec, target2_vec, target3_vec;

  for(number = 0; number < eighth_points; ++number) {
    src0_vec = vld1q_s16(src0);
    src1_vec = vld1q_s16(src1);
    src2_vec = vld1q_s16(src2);
    src3_vec = vld1q_s16(src3);
    src4_vec = vld1q_s16(src4);

    target0_vec = vaddq_s16(src0_vec, src1_vec);
    target1_vec = vaddq_s16(src0_vec, src2_vec);
    target2_vec = vaddq_s16(src0_vec, src3_vec);
    target3_vec = vaddq_s16(src0_vec, src4_vec);

    vst1q_s16(target0, target0_vec);
    vst1q_s16(target1, target1_vec);
    vst1q_s16(target2, target2_vec);
    vst1q_s16(target3, target3_vec);

    src0 += 8;
    src1 += 8;
    src2 += 8;
    src3 += 8;
    src4 += 8;
    target0 += 8;
    target1 += 8;
    target2 += 8;
    target3 += 8;
  }

  /* Scalar tail: src0 is only advanced on the last of the four sums so that
     all four outputs of this element use the same src0 position. */
  for(number = eighth_points * 8; number < num_points; ++number) {
    *target0++ = (short)(*src0 + *src1++);
    *target1++ = (short)(*src0 + *src2++);
    *target2++ = (short)(*src0 + *src3++);
    *target3++ = (short)(*src0++ + *src4++);
  }
}
177 
178 #endif /* LV_HAVE_NEON */
179 
180 
181 #ifdef LV_HAVE_GENERIC
182 
/*
 * Portable reference kernel: adds src0 element-wise to each of src1..src4.
 *
 *   target0[i] = src0[i] + src1[i]
 *   target1[i] = src0[i] + src2[i]
 *   target2[i] = src0[i] + src3[i]
 *   target3[i] = src0[i] + src4[i]      for 0 <= i < num_points
 *
 * target0..target3  output buffers, each holding at least num_points shorts
 * src0..src4        input buffers, each holding at least num_points shorts
 * num_points        number of 16-bit elements to process per buffer
 */
static inline void volk_16i_x5_add_quad_16i_x4_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points) {

  unsigned int number;

  /* Iterate over num_points directly with an unsigned counter; going through
     a byte count (num_points * 2) and a signed int bound would lose the top
     bit of num_points for very large inputs. */
  for(number = 0; number < num_points; ++number) {
    /* The sums are computed in int and narrowed back to 16 bits. */
    target0[number] = (short)(src0[number] + src1[number]);
    target1[number] = (short)(src0[number] + src2[number]);
    target2[number] = (short)(src0[number] + src3[number]);
    target3[number] = (short)(src0[number] + src4[number]);
  }
}
198 
199 #endif /* LV_HAVE_GENERIC */
200 
201 
202 
203 
204 
205 #endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H*/