GNU Radio Manual and C++ API Reference  3.7.6.1
The Free & Open Software Radio Ecosystem
volk_16i_x4_quad_max_star_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 #ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
24 #define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
25 
26 
27 #include<inttypes.h>
28 #include<stdio.h>
29 
30 
31 
32 
33 
34 #ifdef LV_HAVE_SSE2
35 
36 #include<emmintrin.h>
37 
/*
 * Lane-wise "max-star" selection on eight signed 16-bit values.
 *
 * For every lane the result is a when the wrapped 16-bit difference (a - b)
 * is strictly positive, and b otherwise. This is the same rule the scalar
 * tail loop applies: ((short)(a - b) > 0) ? a : b.
 */
static inline __m128i volk_16i_x4_quad_max_star_16i_sse2_pick(__m128i a, __m128i b) {
    /* All-ones in lanes where (a - b) > 0, all-zeros elsewhere. */
    const __m128i take_a = _mm_cmpgt_epi16(_mm_sub_epi16(a, b), _mm_setzero_si128());

    /* The two masked halves never overlap, so OR merges them. */
    return _mm_or_si128(_mm_and_si128(take_a, a), _mm_andnot_si128(take_a, b));
}

/*
 * SSE2 kernel: for each of num_points elements, reduce the four inputs
 * src0..src3 to one output using the pairwise max-star rule
 *
 *     t0  = ((short)(src0 - src1) > 0) ? src0 : src1
 *     t1  = ((short)(src2 - src3) > 0) ? src2 : src3
 *     out = ((short)(t0   - t1)   > 0) ? t0   : t1
 *
 * target      - output buffer, num_points elements
 * src0..src3  - input buffers, num_points elements each
 * num_points  - number of 16-bit elements to process
 *
 * Blocks of eight elements go through aligned loads/stores
 * (_mm_load_si128 / _mm_store_si128), so all five buffers must be 16-byte
 * aligned. The remaining (num_points % 8) elements are handled by a scalar
 * loop that applies the identical rule, so every element gets the same
 * result whether it lands in the vector part or in the tail.
 */
static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) {
    /* Derive counts from num_points directly; num_points * 2 could wrap. */
    const unsigned int eighth_points = num_points / 8;
    unsigned int i;

    const __m128i *p_src0 = (const __m128i*)src0;
    const __m128i *p_src1 = (const __m128i*)src1;
    const __m128i *p_src2 = (const __m128i*)src2;
    const __m128i *p_src3 = (const __m128i*)src3;
    __m128i *p_target = (__m128i*)target;

    for (i = 0; i < eighth_points; ++i) {
        const __m128i best01 = volk_16i_x4_quad_max_star_16i_sse2_pick(
            _mm_load_si128(p_src0 + i), _mm_load_si128(p_src1 + i));
        const __m128i best23 = volk_16i_x4_quad_max_star_16i_sse2_pick(
            _mm_load_si128(p_src2 + i), _mm_load_si128(p_src3 + i));

        _mm_store_si128(p_target + i,
                        volk_16i_x4_quad_max_star_16i_sse2_pick(best01, best23));
    }

    /* Scalar tail for the elements that do not fill a whole vector. */
    for (i = eighth_points * 8; i < num_points; ++i) {
        const short temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        const short temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
}
187 
188 #endif /*LV_HAVE_SSE2*/
189 
190 #ifdef LV_HAVE_NEON
191 #include <arm_neon.h>
/*
 * NEON kernel: for each of num_points elements, reduce the four inputs
 * src0..src3 to one output using the pairwise max-star rule
 *
 *     t0  = ((short)(src0 - src1) > 0) ? src0 : src1
 *     t1  = ((short)(src2 - src3) > 0) ? src2 : src3
 *     out = ((short)(t0   - t1)   > 0) ? t0   : t1
 *
 * target      - output buffer, num_points elements
 * src0..src3  - input buffers, num_points elements each
 * num_points  - number of 16-bit elements to process
 *
 * Eight elements are processed per vector iteration; the remaining
 * (num_points % 8) elements go through a scalar loop applying the same rule.
 */
static inline void volk_16i_x4_quad_max_star_16i_neon(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) {
    const unsigned int eighth_points = num_points / 8;
    unsigned i;

    int16x8_t src0_vec, src1_vec, src2_vec, src3_vec;
    int16x8_t result1_vec, result2_vec;
    /* Explicit zero vector; XOR-ing an uninitialized register with itself
     * reads an indeterminate value. */
    const int16x8_t zeros = vdupq_n_s16(0);

    for(i = 0; i < eighth_points; ++i) {
        src0_vec = vld1q_s16(src0);
        src1_vec = vld1q_s16(src1);
        src2_vec = vld1q_s16(src2);
        src3_vec = vld1q_s16(src3);

        /* Per lane: first operand where the wrapped difference is > 0,
         * second operand otherwise (a zero difference means both are equal). */
        result1_vec = vbslq_s16(vcgtq_s16(vsubq_s16(src0_vec, src1_vec), zeros),
                                src0_vec, src1_vec);
        result2_vec = vbslq_s16(vcgtq_s16(vsubq_s16(src2_vec, src3_vec), zeros),
                                src2_vec, src3_vec);

        /* Second stage: same selection between the two pair winners. */
        result1_vec = vbslq_s16(vcgtq_s16(vsubq_s16(result1_vec, result2_vec), zeros),
                                result1_vec, result2_vec);

        vst1q_s16(target, result1_vec);
        src0 += 8;
        src1 += 8;
        src2 += 8;
        src3 += 8;
        target += 8;
    }

    /* Scalar tail; the pointers already point just past the vector part. */
    for(i = eighth_points * 8; i < num_points; ++i) {
        const short temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1;
        const short temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3;
        *target++ = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
        src0++;
        src1++;
        src2++;
        src3++;
    }
}
248 #endif /* LV_HAVE_NEON */
249 
250 
251 #ifdef LV_HAVE_GENERIC
/*
 * Portable reference kernel: for each of num_points elements, reduce the
 * four inputs src0..src3 to one output using the pairwise max-star rule
 *
 *     t0  = ((short)(src0 - src1) > 0) ? src0 : src1
 *     t1  = ((short)(src2 - src3) > 0) ? src2 : src3
 *     out = ((short)(t0   - t1)   > 0) ? t0   : t1
 *
 * The difference is narrowed back to short before the comparison, so the
 * decision is made on the 16-bit difference rather than on the full int
 * difference.
 *
 * target      - output buffer, num_points elements
 * src0..src3  - input buffers, num_points elements each
 * num_points  - number of 16-bit elements to process
 */
static inline void volk_16i_x4_quad_max_star_16i_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) {
    unsigned int i;

    /* Iterate over num_points directly: going through a byte count
     * (num_points * 2) can wrap and a signed int bound can go negative. */
    for(i = 0; i < num_points; ++i) {
        const short temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        const short temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
}
268 
269 
270 
271 
272 #endif /*LV_HAVE_GENERIC*/
273 
274 #endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a_H*/