Main MRPT website > C++ reference
MRPT logo
CImage_SSE2.cpp
Go to the documentation of this file.
1 /* +---------------------------------------------------------------------------+
2  | The Mobile Robot Programming Toolkit (MRPT) C++ library |
3  | |
4  | http://www.mrpt.org/ |
5  | |
6  | Copyright (C) 2005-2012 University of Malaga |
7  | |
8  | This software was written by the Machine Perception and Intelligent |
9  | Robotics Lab, University of Malaga (Spain). |
10  | Contact: Jose-Luis Blanco <jlblanco@ctima.uma.es> |
11  | |
12  | This file is part of the MRPT project. |
13  | |
14  | MRPT is free software: you can redistribute it and/or modify |
15  | it under the terms of the GNU General Public License as published by |
16  | the Free Software Foundation, either version 3 of the License, or |
17  | (at your option) any later version. |
18  | |
19  | MRPT is distributed in the hope that it will be useful, |
20  | but WITHOUT ANY WARRANTY; without even the implied warranty of |
21  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
22  | GNU General Public License for more details. |
23  | |
24  | You should have received a copy of the GNU General Public License |
25  | along with MRPT. If not, see <http://www.gnu.org/licenses/>. |
26  | |
27  +---------------------------------------------------------------------------+ */
28 
29 #include <mrpt/base.h> // Precompiled headers
30 
31 #if MRPT_HAS_SSE2
32 // ---------------------------------------------------------------------------
33 // This file contains the SSE2 optimized functions for mrpt::utils::CImage
34 // See the sources and the doxygen documentation page "sse_optimizations" for more details.
35 //
36 // Some functions here are derived from sources in libcvd, released
37 // under LGPL. See http://mi.eng.cam.ac.uk/~er258/cvd/
38 //
39 // ---------------------------------------------------------------------------
40 
41 #include <mrpt/utils/CImage.h>
42 #include <mrpt/utils/SSE_types.h>
43 #include <mrpt/utils/SSE_macros.h>
44 #include "CImage_SSEx.h"
45 
46 /** \addtogroup sse_optimizations
47  * SSE optimized functions
48  * @{
49  */
50 
/** Decimate an 8-bit grayscale image by 2 in both directions: every 2x2 block of
  * input pixels yields one output pixel, which is a copy of the block's top-left
  * pixel (the other three pixels are discarded, no filtering is applied).
  *  - <b>Input format:</b> uint8_t, 1 channel
  *  - <b>Output format:</b> uint8_t, 1 channel
  *  - <b>Preconditions:</b> in & out aligned to 16 bytes, w = k*16 (w=width in pixels), widthStep=w*1
  *  - <b>Notes:</b> Produces (w/2) x (h/2) pixels, written contiguously. Odd-indexed
  *    input rows are skipped; if h is odd, the last input row is ignored.
  *  - <b>Requires:</b> SSE2
  *  - <b>Invoked from:</b> mrpt::utils::CImage::scaleHalf()
  *
  * \param in  First pixel of the source image (read with aligned 16-byte loads).
  * \param out First pixel of the destination image (written 8 bytes at a time).
  * \param w   Source width, in pixels.
  * \param h   Source height, in pixels.
  */
void image_SSE2_scale_half_1c8u(const uint8_t* in, uint8_t* out, int w, int h)
{
	// 0x00FF in each 16-bit lane: keeps the first (even-indexed) byte of every
	// horizontal pixel pair and clears the second one.
	const __m128i even_bytes = _mm_set1_epi16(0x00FF);

	const int blocks_per_row = w >> 4;  // one block = 16 input pixels = 8 output pixels
	const int out_rows       = h >> 1;

	// After each processed row, "in" sits at the start of the next (odd) row;
	// advancing it by another w skips that row entirely.
	for (int row = 0; row < out_rows; ++row, in += w)
	{
		for (int blk = 0; blk < blocks_per_row; ++blk, in += 16, out += 8)
		{
			const __m128i kept = _mm_and_si128(_mm_load_si128((const __m128i*)in), even_bytes);
			// Every 16-bit lane now holds a value in [0,255], so the saturating
			// pack is exact; only the low 8 bytes of the packed result are stored.
			_mm_storel_epi64((__m128i*)out, _mm_packus_epi16(kept, kept));
		}
	}
}
79 
80 
/** Decimate an 8-bit grayscale image by 2 in both directions, replacing every 2x2
  * block of input pixels by (approximately) its arithmetic mean.
  *  - <b>Input format:</b> uint8_t, 1 channel
  *  - <b>Output format:</b> uint8_t, 1 channel
  *  - <b>Preconditions:</b> in & out aligned to 16 bytes, w = k*16 (w=width in pixels), widthStep=w*1
  *  - <b>Notes:</b> The mean is computed in two steps (vertical, then horizontal) with
  *    the SSE2 averaging instructions, each of which rounds upwards; the result may
  *    therefore be 1 above the exactly rounded mean of the four pixels.
  *    If h is odd, the last input row is ignored.
  *  - <b>Requires:</b> SSE2
  *  - <b>Invoked from:</b> mrpt::utils::CImage::scaleHalfSmooth()
  *
  * \param in  First pixel of the source image (read with aligned 16-byte loads).
  * \param out First pixel of the destination image (written 8 bytes at a time).
  * \param w   Source width, in pixels.
  * \param h   Source height, in pixels.
  */
void image_SSE2_scale_half_smooth_1c8u(const uint8_t* in, uint8_t* out, int w, int h)
{
	// 0x00FF in each 16-bit lane: isolates one byte per horizontal pixel pair.
	const __m128i low_byte = _mm_set1_epi16(0x00FF);

	const int blocks_per_row = w >> 4;  // one block = 16 input pixels = 8 output pixels
	const int out_rows       = h >> 1;

	const uint8_t* upper = in;      // even input row of the current row pair
	const uint8_t* lower = in + w;  // odd input row right below it

	for (int row = 0; row < out_rows; ++row)
	{
		for (int blk = 0; blk < blocks_per_row; ++blk)
		{
			// Vertical pass: byte-wise average of the two rows.
			const __m128i vert = _mm_avg_epu8(
				_mm_load_si128((const __m128i*)upper),
				_mm_load_si128((const __m128i*)lower));

			// Horizontal pass: split into even and odd columns, widened to
			// 16-bit lanes, and average them lane by lane.
			const __m128i even_cols = _mm_and_si128(vert, low_byte);
			const __m128i odd_cols  = _mm_and_si128(_mm_srli_si128(vert, 1), low_byte);
			const __m128i mean      = _mm_avg_epu16(even_cols, odd_cols);

			// Lanes are within [0,255]: the saturating pack is exact. Only the
			// low 8 bytes of the packed register are written out.
			_mm_storel_epi64((__m128i*)out, _mm_packus_epi16(mean, mean));

			upper += 16;
			lower += 16;
			out   += 8;
		}

		// Both pointers are at the end of their rows; jump over the row that
		// the other pointer has just consumed.
		upper += w;
		lower += w;
	}
}
117 
118 
119 
120 /** KLT score at a given point of a grayscale image.
121  * - <b>Requires:</b> SSE2
122  * - <b>Invoked from:</b> mrpt::utils::CImage::KLT_response()
123  *
124  * This function is not manually optimized for SSE2 but templatized for different
125  * window sizes, so that the compiler can optimize automatically for each size.
126  *
127  * These templates are instantiated only for the most common window sizes (W=[2-16] and W=32),
128  * falling back to
129  * a generic implementation otherwise. The next figure shows the performance (time for
130  * KLT_response() to compute the score for one single pixel) for different window sizes.
131  *
132  * <img src="KLT_response_performance_SSE2.png" >
133  *
134  */
135 float KLT_response_optimized();
136 
137 // TODO:
138 // Sum of absolute differences: Use _mm_sad_epu8
139 
140 /** @} */
141 
142 #endif // end if MRPT_HAS_SSE2



Page generated by Doxygen 1.8.3 for MRPT 0.9.6 SVN: at Fri Feb 15 22:05:02 EST 2013