GNU Radio Manual and C++ API Reference  3.7.6.1
The Free & Open Software Radio Ecosystem
volk_32f_x2_dot_prod_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 #ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H
24 #define INCLUDED_volk_32f_x2_dot_prod_16i_H
25 
26 #include <volk/volk_common.h>
27 #include<stdio.h>
28 
29 
30 #ifdef LV_HAVE_GENERIC
31 
32 
33 static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
34 
35  float dotProduct = 0;
36  const float* aPtr = input;
37  const float* bPtr= taps;
38  unsigned int number = 0;
39 
40  for(number = 0; number < num_points; number++){
41  dotProduct += ((*aPtr++) * (*bPtr++));
42  }
43 
44  *result = (int16_t)dotProduct;
45 }
46 
47 #endif /*LV_HAVE_GENERIC*/
48 
49 
50 #ifdef LV_HAVE_SSE
51 
52 static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
53 
54  unsigned int number = 0;
55  const unsigned int sixteenthPoints = num_points / 16;
56 
57  float dotProduct = 0;
58  const float* aPtr = input;
59  const float* bPtr = taps;
60 
61  __m128 a0Val, a1Val, a2Val, a3Val;
62  __m128 b0Val, b1Val, b2Val, b3Val;
63  __m128 c0Val, c1Val, c2Val, c3Val;
64 
65  __m128 dotProdVal0 = _mm_setzero_ps();
66  __m128 dotProdVal1 = _mm_setzero_ps();
67  __m128 dotProdVal2 = _mm_setzero_ps();
68  __m128 dotProdVal3 = _mm_setzero_ps();
69 
70  for(;number < sixteenthPoints; number++){
71 
72  a0Val = _mm_load_ps(aPtr);
73  a1Val = _mm_load_ps(aPtr+4);
74  a2Val = _mm_load_ps(aPtr+8);
75  a3Val = _mm_load_ps(aPtr+12);
76  b0Val = _mm_load_ps(bPtr);
77  b1Val = _mm_load_ps(bPtr+4);
78  b2Val = _mm_load_ps(bPtr+8);
79  b3Val = _mm_load_ps(bPtr+12);
80 
81  c0Val = _mm_mul_ps(a0Val, b0Val);
82  c1Val = _mm_mul_ps(a1Val, b1Val);
83  c2Val = _mm_mul_ps(a2Val, b2Val);
84  c3Val = _mm_mul_ps(a3Val, b3Val);
85 
86  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
87  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
88  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
89  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
90 
91  aPtr += 16;
92  bPtr += 16;
93  }
94 
95  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
96  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
97  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
98 
99  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
100 
101  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
102 
103  dotProduct = dotProductVector[0];
104  dotProduct += dotProductVector[1];
105  dotProduct += dotProductVector[2];
106  dotProduct += dotProductVector[3];
107 
108  number = sixteenthPoints*16;
109  for(;number < num_points; number++){
110  dotProduct += ((*aPtr++) * (*bPtr++));
111  }
112 
113  *result = (short)dotProduct;
114 }
115 
116 #endif /*LV_HAVE_SSE*/
117 
118 
119 #ifdef LV_HAVE_SSE
120 
121 static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
122 
123  unsigned int number = 0;
124  const unsigned int sixteenthPoints = num_points / 16;
125 
126  float dotProduct = 0;
127  const float* aPtr = input;
128  const float* bPtr = taps;
129 
130  __m128 a0Val, a1Val, a2Val, a3Val;
131  __m128 b0Val, b1Val, b2Val, b3Val;
132  __m128 c0Val, c1Val, c2Val, c3Val;
133 
134  __m128 dotProdVal0 = _mm_setzero_ps();
135  __m128 dotProdVal1 = _mm_setzero_ps();
136  __m128 dotProdVal2 = _mm_setzero_ps();
137  __m128 dotProdVal3 = _mm_setzero_ps();
138 
139  for(;number < sixteenthPoints; number++){
140 
141  a0Val = _mm_loadu_ps(aPtr);
142  a1Val = _mm_loadu_ps(aPtr+4);
143  a2Val = _mm_loadu_ps(aPtr+8);
144  a3Val = _mm_loadu_ps(aPtr+12);
145  b0Val = _mm_loadu_ps(bPtr);
146  b1Val = _mm_loadu_ps(bPtr+4);
147  b2Val = _mm_loadu_ps(bPtr+8);
148  b3Val = _mm_loadu_ps(bPtr+12);
149 
150  c0Val = _mm_mul_ps(a0Val, b0Val);
151  c1Val = _mm_mul_ps(a1Val, b1Val);
152  c2Val = _mm_mul_ps(a2Val, b2Val);
153  c3Val = _mm_mul_ps(a3Val, b3Val);
154 
155  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
156  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
157  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
158  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
159 
160  aPtr += 16;
161  bPtr += 16;
162  }
163 
164  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
165  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
166  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
167 
168  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
169 
170  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
171 
172  dotProduct = dotProductVector[0];
173  dotProduct += dotProductVector[1];
174  dotProduct += dotProductVector[2];
175  dotProduct += dotProductVector[3];
176 
177  number = sixteenthPoints*16;
178  for(;number < num_points; number++){
179  dotProduct += ((*aPtr++) * (*bPtr++));
180  }
181 
182  *result = (short)dotProduct;
183 }
184 
185 #endif /*LV_HAVE_SSE*/
186 
187 #endif /*INCLUDED_volk_32f_x2_dot_prod_16i_H*/
signed short int16_t
Definition: stdint.h:76
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9