23 #ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
24 #define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
/*
 * NOTE(review): these four arrays look like the members of the decision_t
 * union used by BFLY() below (which indexes d->w); the enclosing declaration
 * is not visible in this listing -- confirm against the full source.
 * Each member is sized from the literal 64 divided by an element width in
 * bits (8, 32, 16, 8), i.e. each view spans 64 bits assuming 8-bit char,
 * 16-bit short and 32-bit int.
 */
29 unsigned char t[64/8];
30 unsigned int w[64/32];
31 unsigned short s[64/16];
32 unsigned char c[64/8];
/*
 * renormalize -- operate on the NUMSTATES-entry byte array X.
 *
 * X          array of unsigned char values, indexed 0..NUMSTATES-1
 * threshold  unsigned char limit; how it is tested is not visible here
 *
 * The visible lines seed `min` with X[0] and then make two passes over all
 * NUMSTATES entries. The loop bodies, the declaration of `i` and the use of
 * `threshold` are missing from this listing.
 * NOTE(review): presumably the first pass finds the minimum of X and the
 * second subtracts it from every entry when X[0] exceeds threshold --
 * confirm against the full source.
 */
39 static inline void renormalize(
unsigned char* X,
unsigned char threshold){
/* Running minimum starts at the first entry. */
43 unsigned char min=X[0];
/* First pass over every state (body not visible). */
45 for(i=0;i<NUMSTATES;i++)
/* Second pass over every state (body not visible). */
48 for(i=0;i<NUMSTATES;i++)
/*
 * BFLY -- one scalar butterfly: from old metrics X[i] and X[i+NUMSTATES/2]
 * produce new metrics Y[2*i] and Y[2*i+1] and record two decision bits.
 *
 * i         butterfly index; reads X[i] and X[i+NUMSTATES/2], writes
 *           Y[2*i] and Y[2*i+1]
 * s         step index; selects syms[s*RATE+j] and the decision word group
 * syms      input symbols, RATE bytes consumed per step
 * Y         output metrics
 * X         input metrics
 * d         decision storage, accessed through its `w` (unsigned int) view
 * Branchtab expected-symbol table, read at i + j*NUMSTATES/2
 *
 * NOTE(review): the loop over j that accumulates `metric`, the initial value
 * of `metric`, and the assignment of m0 are not visible in this listing;
 * m0 is presumably X[i] + metric -- confirm against the full source.
 */
56 static inline void BFLY(
int i,
int s,
unsigned char * syms,
unsigned char *Y,
unsigned char *X,
decision_t * d,
unsigned char* Branchtab) {
57 int j, decision0, decision1;
58 unsigned char metric,m0,m1,m2,m3;
63 int PRECISIONSHIFT = 2;
/* Accumulate the XOR distance between the expected and received symbol,
 * scaled down by METRICSHIFT. */
69 metric += (Branchtab[i+j*NUMSTATES/2] ^ syms[s*RATE+j])>>METRICSHIFT ;
/* Drop PRECISIONSHIFT low bits from the accumulated metric. */
70 metric=metric>>PRECISIONSHIFT;
/* Largest value `metric` can take after both shifts; (max - metric) is used
 * below as the complementary branch metric. */
72 unsigned char max = ((RATE*((256 -1)>>METRICSHIFT))>>PRECISIONSHIFT);
/* Candidate path metrics (results are truncated to unsigned char). */
75 m1 = X[i+NUMSTATES/2] + (max - metric);
76 m2 = X[i] + (max - metric);
77 m3 = X[i+NUMSTATES/2] + metric;
/* The operands are promoted to int before subtraction, so each decision is 1
 * exactly when the first candidate is strictly larger than the second. */
79 decision0 = (
signed int)(m0-m1) > 0;
80 decision1 = (
signed int)(m2-m3) > 0;
/* Keep the smaller candidate of each pair as the new metric. */
82 Y[2*i] = decision0 ? m1 : m0;
83 Y[2*i+1] = decision1 ? m3 : m2;
/* OR the two decision bits into the unsigned-int view of the decisions:
 * the word index is i divided by half the bit width of unsigned int, offset
 * by s times the number of unsigned ints in one decision_t; the bit position
 * is 2*i modulo the bit width of unsigned int. The target word is only ORed
 * into, never cleared, in the visible code. */
85 d->
w[i/(
sizeof(
unsigned int)*8/2)+s*(
sizeof(
decision_t)/
sizeof(
unsigned int))] |=
86 (decision0|decision1<<1) << ((2*i)&(
sizeof(
unsigned int)*8-1));
92 #include <pmmintrin.h>
93 #include <emmintrin.h>
94 #include <xmmintrin.h>
/*
 * volk_8u_x4_conv_k7_r2_8u_spiral -- SSE implementation using 128-bit
 * integer intrinsics on byte metrics.
 *
 * Y, X       metric buffers; both are accessed as four __m128i vectors
 *            (indices 0..3) in the renormalization code below
 * syms       input symbols (the loads from it are not visible in this listing)
 * dec        decision output, accessed through a `short int *` view
 * framebits, excess
 *            the main loop runs (framebits + excess) >> 1 times and a scalar
 *            tail handles (framebits + excess) % 2 leftover steps via BFLY()
 * Branchtab  branch table, accessed as __m128i vectors
 *
 * Each main-loop iteration contains two similar halves: the first takes its
 * vector pointer from X and its output pointer from Y, the second takes its
 * vector pointer from Y and its output pointer from X.
 *
 * NOTE(review): many lines (loads from X/Y/syms/Branchtab, stores to Y/X/dec,
 * several declarations, closing braces) are missing from this listing, so the
 * comments below describe only the visible operations. The pointer casts to
 * __m128i * presumably require 16-byte-aligned buffers -- confirm.
 */
98 static inline void volk_8u_x4_conv_k7_r2_8u_spiral(
unsigned char* Y,
unsigned char* X,
unsigned char* syms,
unsigned char* dec,
unsigned int framebits,
unsigned int excess,
unsigned char* Branchtab) {
/* One iteration per pair of steps. */
100 for(i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
101 unsigned char a75, a81;
103 short int s20, s21, s26, s27;
104 unsigned char *a74, *a80, *b6;
105 short int *a110, *a111, *a91, *a93, *a94;
106 __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83
107 , *a95, *a96, *a97, *a98, *a99;
108 __m128i a105, a106, a86, a87;
109 __m128i a100, a101, a103, a104, a107, a108, a109
110 , a76, a78, a79, a82, a84, a85, a88, a89
111 , a90, d10, d11, d12, d9, m23, m24, m25
112 , m26, m27, m28, m29, m30, s18, s19, s22
113 , s23, s24, s25, s28, s29, t13, t14, t15
/* ---- First half: vector view of X as the metric source. ---- */
115 a71 = ((__m128i *) X);
/* Broadcast one byte (a75) to all 16 lanes and XOR it with a78
 * (presumably a vector loaded through a77 from Branchtab -- load not visible). */
122 a76 = _mm_set1_epi8(a75);
123 a77 = ((__m128i *) Branchtab);
125 a79 = _mm_xor_si128(a76, a78);
/* Same for the second byte (a81). */
129 a82 = _mm_set1_epi8(a81);
132 a85 = _mm_xor_si128(a82, a84);
/* Per-byte rounded average of the two XOR results. */
133 t13 = _mm_avg_epu8(a79,a85);
134 a86 = ((__m128i ) t13);
/* Shift right by 2 in 16-bit lanes, then mask each byte with 63 so bits
 * shifted in from the neighbouring byte are discarded. */
135 a87 = _mm_srli_epi16(a86, 2);
136 a88 = ((__m128i ) a87);
137 t14 = _mm_and_si128(a88, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
138 , 63, 63, 63, 63, 63, 63, 63, 63
140 t15 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
141 , 63, 63, 63, 63, 63, 63, 63, 63
/* Four candidate metrics, using saturating unsigned byte adds. */
143 m23 = _mm_adds_epu8(s18, t14);
144 m24 = _mm_adds_epu8(s19, t15);
145 m25 = _mm_adds_epu8(s18, t15);
146 m26 = _mm_adds_epu8(s19, t14);
/* Per-byte minimum of each candidate pair; the compare yields 0xFF in lanes
 * where the first operand of the min (m24 / m26) was the one kept. */
147 a89 = _mm_min_epu8(m24, m23);
148 d9 = _mm_cmpeq_epi8(a89, m24);
149 a90 = _mm_min_epu8(m26, m25);
150 d10 = _mm_cmpeq_epi8(a90, m26);
/* Interleave the two decision masks and collapse them to 16 bits each
 * (low half, then high half). */
151 s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9,d10));
152 a91 = ((
short int *) dec);
156 s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9,d10));
/* Interleave the surviving metrics byte by byte. */
159 s22 = _mm_unpacklo_epi8(a89, a90);
160 s23 = _mm_unpackhi_epi8(a89, a90);
161 a95 = ((__m128i *) Y);
/* Same sequence again for a second set of vectors (a100/a103, s24/s25). */
171 a101 = _mm_xor_si128(a76, a100);
174 a104 = _mm_xor_si128(a82, a103);
175 t16 = _mm_avg_epu8(a101,a104);
176 a105 = ((__m128i ) t16);
177 a106 = _mm_srli_epi16(a105, 2);
178 a107 = ((__m128i ) a106);
179 t17 = _mm_and_si128(a107, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
180 , 63, 63, 63, 63, 63, 63, 63, 63
182 t18 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
183 , 63, 63, 63, 63, 63, 63, 63, 63
185 m27 = _mm_adds_epu8(s24, t17);
186 m28 = _mm_adds_epu8(s25, t18);
187 m29 = _mm_adds_epu8(s24, t18);
188 m30 = _mm_adds_epu8(s25, t17);
189 a108 = _mm_min_epu8(m28, m27);
190 d11 = _mm_cmpeq_epi8(a108, m28);
191 a109 = _mm_min_epu8(m30, m29);
192 d12 = _mm_cmpeq_epi8(a109, m30);
193 s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11,d12));
196 s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11,d12));
199 s28 = _mm_unpacklo_epi8(a108, a109);
200 s29 = _mm_unpackhi_epi8(a108, a109);
/* Renormalize Y when its first byte exceeds 210: find the minimum byte over
 * the four vectors of Y and subtract it from all of them. */
205 if ((((
unsigned char *) Y)[0]>210)) {
/* Lane-wise minimum across the four vectors. */
207 m5 = ((__m128i *) Y)[0];
208 m5 = _mm_min_epu8(m5, ((__m128i *) Y)[1]);
209 m5 = _mm_min_epu8(m5, ((__m128i *) Y)[2]);
210 m5 = _mm_min_epu8(m5, ((__m128i *) Y)[3]);
/* Horizontal reduction: fold by 8 bytes, then by 32, 16 and 8 bits, leaving
 * the overall minimum in the lowest byte. */
212 m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
213 m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 32)), ((__m128i ) m7)));
214 m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 16)), ((__m128i ) m7)));
215 m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 8)), ((__m128i ) m7)));
/* Broadcast that lowest byte to all 16 lanes. */
216 m7 = _mm_unpacklo_epi8(m7, m7);
217 m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
218 m6 = _mm_unpacklo_epi64(m7, m7);
/* Saturating subtract of the minimum from every metric in Y. */
219 ((__m128i *) Y)[0] = _mm_subs_epu8(((__m128i *) Y)[0], m6);
220 ((__m128i *) Y)[1] = _mm_subs_epu8(((__m128i *) Y)[1], m6);
221 ((__m128i *) Y)[2] = _mm_subs_epu8(((__m128i *) Y)[2], m6);
222 ((__m128i *) Y)[3] = _mm_subs_epu8(((__m128i *) Y)[3], m6);
/* ---- Second half: same computation with the roles of X and Y swapped
 * (vector view of Y as source, X as destination). ---- */
224 unsigned char a188, a194;
226 short int s48, s49, s54, s55;
227 unsigned char *a187, *a193, *b15;
228 short int *a204, *a206, *a207, *a223, *a224, *b16;
229 __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210
230 , *a211, *a212, *a215, *a225, *a226;
231 __m128i a199, a200, a218, a219;
232 __m128i a189, a191, a192, a195, a197, a198, a201
233 , a202, a203, a213, a214, a216, a217, a220, a221
234 , a222, d17, d18, d19, d20, m39, m40, m41
235 , m42, m43, m44, m45, m46, s46, s47, s50
236 , s51, s52, s53, s56, s57, t25, t26, t27
238 a184 = ((__m128i *) Y);
246 a189 = _mm_set1_epi8(a188);
247 a190 = ((__m128i *) Branchtab);
249 a192 = _mm_xor_si128(a189, a191);
252 a195 = _mm_set1_epi8(a194);
255 a198 = _mm_xor_si128(a195, a197);
256 t25 = _mm_avg_epu8(a192,a198);
257 a199 = ((__m128i ) t25);
258 a200 = _mm_srli_epi16(a199, 2);
259 a201 = ((__m128i ) a200);
260 t26 = _mm_and_si128(a201, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
261 , 63, 63, 63, 63, 63, 63, 63, 63
263 t27 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
264 , 63, 63, 63, 63, 63, 63, 63, 63
266 m39 = _mm_adds_epu8(s46, t26);
267 m40 = _mm_adds_epu8(s47, t27);
268 m41 = _mm_adds_epu8(s46, t27);
269 m42 = _mm_adds_epu8(s47, t26);
270 a202 = _mm_min_epu8(m40, m39);
271 d17 = _mm_cmpeq_epi8(a202, m40);
272 a203 = _mm_min_epu8(m42, m41);
273 d18 = _mm_cmpeq_epi8(a203, m42);
274 s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17,d18));
275 a204 = ((
short int *) dec);
280 s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17,d18));
283 s50 = _mm_unpacklo_epi8(a202, a203);
284 s51 = _mm_unpackhi_epi8(a202, a203);
285 a208 = ((__m128i *) X);
295 a214 = _mm_xor_si128(a189, a213);
298 a217 = _mm_xor_si128(a195, a216);
299 t28 = _mm_avg_epu8(a214,a217);
300 a218 = ((__m128i ) t28);
301 a219 = _mm_srli_epi16(a218, 2);
302 a220 = ((__m128i ) a219);
303 t29 = _mm_and_si128(a220, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
304 , 63, 63, 63, 63, 63, 63, 63, 63
306 t30 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
307 , 63, 63, 63, 63, 63, 63, 63, 63
309 m43 = _mm_adds_epu8(s52, t29);
310 m44 = _mm_adds_epu8(s53, t30);
311 m45 = _mm_adds_epu8(s52, t30);
312 m46 = _mm_adds_epu8(s53, t29);
313 a221 = _mm_min_epu8(m44, m43);
314 d19 = _mm_cmpeq_epi8(a221, m44);
315 a222 = _mm_min_epu8(m46, m45);
316 d20 = _mm_cmpeq_epi8(a222, m46);
317 s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19,d20));
320 s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19,d20));
323 s56 = _mm_unpacklo_epi8(a221, a222);
324 s57 = _mm_unpackhi_epi8(a221, a222);
/* Renormalize X when its first byte exceeds 210, using the same
 * min-reduce / broadcast / saturating-subtract sequence as for Y above. */
329 if ((((
unsigned char *) X)[0]>210)) {
331 m12 = ((__m128i *) X)[0];
332 m12 = _mm_min_epu8(m12, ((__m128i *) X)[1]);
333 m12 = _mm_min_epu8(m12, ((__m128i *) X)[2]);
334 m12 = _mm_min_epu8(m12, ((__m128i *) X)[3]);
336 m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
337 m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 32)), ((__m128i ) m14)));
338 m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 16)), ((__m128i ) m14)));
339 m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 8)), ((__m128i ) m14)));
340 m14 = _mm_unpacklo_epi8(m14, m14);
341 m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
342 m13 = _mm_unpacklo_epi64(m14, m14);
343 ((__m128i *) X)[0] = _mm_subs_epu8(((__m128i *) X)[0], m13);
344 ((__m128i *) X)[1] = _mm_subs_epu8(((__m128i *) X)[1], m13);
345 ((__m128i *) X)[2] = _mm_subs_epu8(((__m128i *) X)[2], m13);
346 ((__m128i *) X)[3] = _mm_subs_epu8(((__m128i *) X)[3], m13);
/* Scalar tail: runs once when (framebits + excess) is odd, calling BFLY()
 * for the step index just past the pairs handled by the vector loop.
 * NOTE(review): the enclosing loop over i is not visible in this listing. */
359 for(j=0; j < (framebits + excess) % 2; ++j) {
362 BFLY(i, (((framebits+excess) >> 1) << 1) + j , syms, Y, X, (
decision_t *)dec, Branchtab);
/*
 * volk_8u_x4_conv_k7_r2_8u_generic -- portable scalar implementation with the
 * same parameter list as the SSE variant.
 *
 * Y, X       metric buffers
 * syms       input symbols
 * dec        decision output
 * framebits, excess
 *            the outer loop runs framebits + excess times
 * Branchtab  branch table
 *
 * NOTE(review): the loop bodies and the declarations of `s`, `i` and `tmp`
 * are missing from this listing. Presumably each step calls BFLY() for every
 * i, renormalizes against RENORMALIZE_THRESHOLD and then swaps X and Y via
 * `tmp` -- confirm against the full source.
 */
390 static inline void volk_8u_x4_conv_k7_r2_8u_generic(
unsigned char* Y,
unsigned char* X,
unsigned char* syms,
unsigned char* dec,
unsigned int framebits,
unsigned int excess,
unsigned char* Branchtab) {
/* Total number of steps; note the unsigned sum is stored in a signed int. */
391 int nbits = framebits + excess;
/* Same numeric limit (210) that the SSE variant compares against. */
393 int RENORMALIZE_THRESHOLD = 210;
/* One outer iteration per step, one inner iteration per butterfly. */
400 for (s=0;s<nbits;s++){
402 for(i=0;i<NUMSTATES/2;i++){
/* Y is reassigned from `tmp` (the rest of the swap is not visible). */
413 Y = (
unsigned char*)tmp;
float min(float a, float b)
unsigned int * w
Definition: cc_common.h:36
static void renormalize(unsigned char *X, unsigned char threshold)
Definition: volk_8u_x4_conv_k7_r2_8u.h:39
Definition: cc_common.h:33
static void BFLY(int i, int s, unsigned char *syms, unsigned char *Y, unsigned char *X, decision_t *d, unsigned char *Branchtab)
Definition: volk_8u_x4_conv_k7_r2_8u.h:56