GNU Radio Manual and C++ API Reference  3.7.6.1
The Free & Open Software Radio Ecosystem
volk_16i_max_star_horizontal_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 #ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H
24 #define INCLUDED_volk_16i_max_star_horizontal_16i_a_H
25 
26 #include <volk/volk_common.h>
27 
28 #include<inttypes.h>
29 #include<stdio.h>
30 
31 
32 #ifdef LV_HAVE_SSSE3
33 
34 #include<xmmintrin.h>
35 #include<emmintrin.h>
36 #include<tmmintrin.h>
37 
38 static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigned int num_points) {
39 
40  const unsigned int num_bytes = num_points*2;
41 
42  const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
43  const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d};
44  const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
45  const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02};
46 
47 
48 
49  __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
50  __m128i xmm5, xmm6, xmm7, xmm8;
51 
52  xmm4 = _mm_load_si128((__m128i*)shufmask0);
53  xmm5 = _mm_load_si128((__m128i*)shufmask1);
54  xmm6 = _mm_load_si128((__m128i*)andmask0);
55  xmm7 = _mm_load_si128((__m128i*)andmask1);
56 
57  __m128i *p_target, *p_src0;
58 
59  p_target = (__m128i*)target;
60  p_src0 = (__m128i*)src0;
61 
62  int bound = num_bytes >> 5;
63  int intermediate = (num_bytes >> 4) & 1;
64  int leftovers = (num_bytes >> 1) & 7;
65 
66  int i = 0;
67 
68 
69  for(i = 0; i < bound; ++i) {
70 
71  xmm0 = _mm_load_si128(p_src0);
72  xmm1 = _mm_load_si128(&p_src0[1]);
73 
74 
75 
76  xmm2 = _mm_xor_si128(xmm2, xmm2);
77  p_src0 += 2;
78 
79  xmm3 = _mm_hsub_epi16(xmm0, xmm1);
80 
81  xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
82 
83  xmm8 = _mm_and_si128(xmm2, xmm6);
84  xmm3 = _mm_and_si128(xmm2, xmm7);
85 
86 
87  xmm8 = _mm_add_epi8(xmm8, xmm4);
88  xmm3 = _mm_add_epi8(xmm3, xmm5);
89 
90  xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
91  xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
92 
93 
94  xmm3 = _mm_add_epi16(xmm0, xmm1);
95 
96 
97  _mm_store_si128(p_target, xmm3);
98 
99  p_target += 1;
100 
101  }
102 
103  for(i = 0; i < intermediate; ++i) {
104 
105  xmm0 = _mm_load_si128(p_src0);
106 
107 
108  xmm2 = _mm_xor_si128(xmm2, xmm2);
109  p_src0 += 1;
110 
111  xmm3 = _mm_hsub_epi16(xmm0, xmm1);
112  xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
113 
114  xmm8 = _mm_and_si128(xmm2, xmm6);
115 
116  xmm3 = _mm_add_epi8(xmm8, xmm4);
117 
118  xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
119 
120  _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec);
121 
122  p_target = (__m128i*)((int8_t*)p_target + 8);
123 
124  }
125 
126  for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) {
127  target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
128  }
129 
130 
131 }
132 
133 #endif /*LV_HAVE_SSSE3*/
134 
135 #ifdef LV_HAVE_NEON
136 #include <arm_neon.h>
/*
 * NEON kernel.
 *
 * For every adjacent input pair (src0[2k], src0[2k+1]) one value is written
 * to target[k]: src0[2k] when the 16-bit difference src0[2k] - src0[2k+1] is
 * >= 0, otherwise src0[2k+1].  The difference is formed in 16 bits, so pairs
 * whose true difference does not fit in int16_t select according to the
 * wrapped value.
 *
 * target     - output buffer, one sample per input pair.
 * src0       - input buffer.
 * num_points - number of INPUT samples.  It should be even: with an odd
 *              count the last scalar iteration also reads src0[num_points].
 */
static inline void volk_16i_max_star_horizontal_16i_neon(int16_t* target, int16_t* src0, unsigned int num_points) {
    /* Each vector iteration consumes 16 input samples and emits 8 outputs. */
    const unsigned int sixteenth_points = num_points / 16;
    const unsigned int tail_points = num_points % 16;
    unsigned number;
    int16x8x2_t input_vec;
    int16x8_t diff, max_vec;
    uint16x8_t first_selected;
    const int16x8_t zeros = vdupq_n_s16(0);

    for(number=0; number < sixteenth_points; ++number) {
        /* De-interleaving load: val[0] holds the first sample of each pair,
         * val[1] the second. */
        input_vec = vld2q_s16(src0);
        diff = vsubq_s16(input_vec.val[0], input_vec.val[1]);

        /* All-ones lanes where the first sample is kept. */
        first_selected = vcgeq_s16(diff, zeros);

        /* Per-lane select: val[0] where the mask is set, val[1] elsewhere. */
        max_vec = vbslq_s16(first_selected, input_vec.val[0], input_vec.val[1]);
        vst1q_s16(target, max_vec);
        src0 += 16;
        target += 8;
    }

    /* Scalar tail; src0 and target already point past the vectorised part. */
    for(number=0; number < tail_points; number+=2) {
        target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0) ? src0[number] : src0[number+1];
    }

}
164 #endif /* LV_HAVE_NEON */
165 
166 #ifdef LV_HAVE_NEON
167 extern void volk_16i_max_star_horizontal_16i_neonasm(int16_t* target, int16_t* src0, unsigned int num_points);
168 #endif /* LV_HAVE_NEON */
169 
170 #ifdef LV_HAVE_GENERIC
/*
 * Portable C kernel.
 *
 * Walks the input two samples at a time and writes one sample per pair:
 * target[k] receives src0[2k] when (int16_t)(src0[2k] - src0[2k+1]) > 0 and
 * src0[2k+1] otherwise.  The difference is narrowed to int16_t before the
 * comparison, so pairs whose true difference does not fit in 16 bits select
 * according to the narrowed value.
 *
 * target     - output buffer, one sample per input pair.
 * src0       - input buffer.
 * num_points - number of INPUT samples.  It should be even: with an odd
 *              count the last iteration also reads src0[num_points].
 */
static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target, int16_t* src0, unsigned int num_points) {
    /* The sample count is derived through a byte count (x2, then /2). */
    const unsigned int byte_count = num_points * 2;
    const int sample_limit = byte_count >> 1;
    int in_idx;

    for (in_idx = 0; in_idx < sample_limit; in_idx += 2) {
        const int16_t first = src0[in_idx];
        const int16_t second = src0[in_idx + 1];
        const int16_t delta = (int16_t)(first - second);

        if (delta > 0) {
            target[in_idx >> 1] = first;
        } else {
            target[in_idx >> 1] = second;
        }
    }
}
185 
186 
187 
188 #endif /*LV_HAVE_GENERIC*/
189 
190 #endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a_H*/
#define bit128_p(x)
Definition: volk_common.h:94
unsigned char uint8_t
Definition: stdint.h:78
signed short int16_t
Definition: stdint.h:76
signed char int8_t
Definition: stdint.h:75