Main MRPT website > C++ reference
MRPT logo
CImage_SSE3.cpp
Go to the documentation of this file.
1 /* +---------------------------------------------------------------------------+
2  | The Mobile Robot Programming Toolkit (MRPT) C++ library |
3  | |
4  | http://www.mrpt.org/ |
5  | |
6  | Copyright (C) 2005-2012 University of Malaga |
7  | |
8  | This software was written by the Machine Perception and Intelligent |
9  | Robotics Lab, University of Malaga (Spain). |
10  | Contact: Jose-Luis Blanco <jlblanco@ctima.uma.es> |
11  | |
12  | This file is part of the MRPT project. |
13  | |
14  | MRPT is free software: you can redistribute it and/or modify |
15  | it under the terms of the GNU General Public License as published by |
16  | the Free Software Foundation, either version 3 of the License, or |
17  | (at your option) any later version. |
18  | |
19  | MRPT is distributed in the hope that it will be useful, |
20  | but WITHOUT ANY WARRANTY; without even the implied warranty of |
21  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
22  | GNU General Public License for more details. |
23  | |
24  | You should have received a copy of the GNU General Public License |
25  | along with MRPT. If not, see <http://www.gnu.org/licenses/>. |
26  | |
27  +---------------------------------------------------------------------------+ */
28 
29 #include <mrpt/base.h> // Precompiled headers
30 
31 // ---------------------------------------------------------------------------
32 // This file contains the SSE3/SSSE3 optimized functions for mrpt::utils::CImage
33 // See the sources and the doxygen documentation page "sse_optimizations" for more details.
34 // ---------------------------------------------------------------------------
35 #if MRPT_HAS_SSE3
36 
37 #include <mrpt/utils/CImage.h>
38 #include <mrpt/utils/SSE_types.h>
39 #include <mrpt/utils/SSE_macros.h>
40 #include "CImage_SSEx.h"
41 
42 
43 /** \addtogroup sse_optimizations
44  * SSE optimized functions
45  * @{
46  */
47 
48 /** Subsample each 2x2 pixel block into 1x1 pixel, taking the first pixel & ignoring the other 3
49  * - <b>Input format:</b> uint8_t, 3 channels (RGB or BGR)
50  * - <b>Output format:</b> uint8_t, 3 channels (RGB or BGR)
51  * - <b>Preconditions:</b> in & out aligned to 16bytes, w = k*16 (w=width in pixels), widthStep=w*3
52  * - <b>Notes:</b>
53  * - <b>Requires:</b> SSSE3
54  * - <b>Invoked from:</b> mrpt::utils::CImage::scaleHalf()
55  */
56 void image_SSSE3_scale_half_3c8u(const uint8_t* in, uint8_t* out, int w, int h)
57 {
58  EIGEN_ALIGN16 const unsigned long long mask0[2] = { 0x0D0C080706020100ull, 0x808080808080800Eull }; // Long words are in inverse order due to little endianness
59  EIGEN_ALIGN16 const unsigned long long mask1[2] = { 0x8080808080808080ull, 0x0E0A090804030280ull };
60  EIGEN_ALIGN16 const unsigned long long mask2[2] = { 0x0C0B0A0605040080ull, 0x8080808080808080ull };
61  EIGEN_ALIGN16 const unsigned long long mask3[2] = { 0x808080808080800Full, 0x8080808080808080ull };
62 
63  const __m128i m0 = _mm_load_si128((const __m128i*)mask0);
64  const __m128i m1 = _mm_load_si128((const __m128i*)mask1);
65  const __m128i m2 = _mm_load_si128((const __m128i*)mask2);
66  const __m128i m3 = _mm_load_si128((const __m128i*)mask3);
67 
68  const int sw = w >> 4; // This are the number of 3*16 blocks in each row
69  const int sh = h >> 1;
70 
71  int odd_row = 0;
72 
73  for (int i=0; i<sh; i++)
74  {
75  for (int j=0; j<sw; j++)
76  {
77  // 16-byte blocks #0,#1,#2:
78  __m128i d0 = _mm_load_si128((const __m128i*)in); in += 16;
79  __m128i d1 = _mm_load_si128((const __m128i*)in); in += 16;
80 
81  // First 16 bytes:
82  __m128i shuf0 = _mm_shuffle_epi8(d0,m0);
83  __m128i shuf1 = _mm_shuffle_epi8(d1,m1);
84 
85  __m128i res0 = _mm_or_si128(shuf0,shuf1);
86 
87  if ((odd_row&0x1)!=0)
88  _mm_storeu_si128((__m128i*)out,res0); // unaligned output
89  else _mm_store_si128 ((__m128i*)out,res0); // aligned output
90  out += 16;
91 
92  // Last 8 bytes:
93  __m128i d2 = _mm_load_si128((const __m128i*)in); in += 16;
94 
95  _mm_storel_epi64( // Write lower 8 bytes only
96  (__m128i*)out,
97  _mm_or_si128(_mm_shuffle_epi8(d2,m2),_mm_shuffle_epi8(d1,m3))
98  );
99  odd_row++;
100  out += 8;
101  }
102  in += 3*w;
103  }
104 }
105 
106 
107 // This is the actual function behind both: image_SSSE3_rgb_to_gray_8u() and image_SSSE3_bgr_to_gray_8u():
108 template <bool IS_RGB>
109 void private_image_SSSE3_rgb_or_bgr_to_gray_8u(const uint8_t* in, uint8_t* out, int w, int h)
110 {
111  // Masks: 0 1 2 3 4 5 6 7 8 9 A B C D E F
112  BUILD_128BIT_CONST(mask0, 80,00, 80,03, 80,06, 80,09, 80,0C, 80,0F, 80,80, 80,80) // reds[0-7] from D0
113  BUILD_128BIT_CONST(mask1, 80,80, 80,80, 80,80, 80,80, 80,80, 80,80, 80,02, 80,05) // reds[0-7] from D1
114 
115  BUILD_128BIT_CONST(mask2, 80,01, 80,04, 80,07, 80,0A, 80,0D, 80,80, 80,80, 80,80) // greens[0-7] from D0
116  BUILD_128BIT_CONST(mask3, 80,80, 80,80, 80,80, 80,80, 80,80, 80,00, 80,03, 80,06) // greens[0-7] from D1
117 
118  BUILD_128BIT_CONST(mask4, 80,02, 80,05, 80,08, 80,0B, 80,0E, 80,80, 80,80, 80,80) // blues[0-7] from D0
119  BUILD_128BIT_CONST(mask5, 80,80, 80,80, 80,80, 80,80, 80,80, 80,01, 80,04, 80,07) // blues[0-7] from D1
120 
121 
122  BUILD_128BIT_CONST(mask6, 80,08, 80,0B, 80,0E, 80,80, 80,80, 80,80, 80,80, 80,80) // reds[8-15] from D1
123  BUILD_128BIT_CONST(mask7, 80,80, 80,80, 80,80, 80,01, 80,04, 80,07, 80,0A, 80,0D) // reds[8-15] from D2
124 
125  BUILD_128BIT_CONST(mask8, 80,09, 80,0C, 80,0F, 80,80, 80,80, 80,80, 80,80, 80,80) // greens[8-15] from D1
126  BUILD_128BIT_CONST(mask9, 80,80, 80,80, 80,80, 80,02, 80,05, 80,08, 80,0B, 80,0E) // greens[8-15] from D2
127 
128  BUILD_128BIT_CONST(mask10,80,0A, 80,0D, 80,80, 80,80, 80,80, 80,80, 80,80, 80,80) // blues[8-15] from D1
129  BUILD_128BIT_CONST(mask11,80,80, 80,80, 80,00, 80,03, 80,06, 80,09, 80,0C, 80,0F) // blues[8-15] from D2
130 
131 
132  BUILD_128BIT_CONST(mask_to_low, 01,03,05,07,09,0B,0D,0F, 80,80,80,80,80,80,80,80)
133 
134 
135  // Conversion factors for RGB->Y
136  BUILD_128BIT_CONST(val_red , 00,1D, 00,1D, 00,1D, 00,1D, 00,1D, 00,1D, 00,1D, 00,1D)
137  BUILD_128BIT_CONST(val_green , 00,96, 00,96, 00,96, 00,96, 00,96, 00,96, 00,96, 00,96)
138  BUILD_128BIT_CONST(val_blue , 00,4D, 00,4D, 00,4D, 00,4D, 00,4D, 00,4D, 00,4D, 00,4D)
139 
140  const __m128i m0 = _mm_load_si128( IS_RGB ? (const __m128i*)mask4 : (const __m128i*)mask0);
141  const __m128i m1 = _mm_load_si128( IS_RGB ? (const __m128i*)mask5 : (const __m128i*)mask1);
142  const __m128i m2 = _mm_load_si128((const __m128i*)mask2);
143  const __m128i m3 = _mm_load_si128((const __m128i*)mask3);
144  const __m128i m4 = _mm_load_si128( IS_RGB ? (const __m128i*)mask0 : (const __m128i*)mask4);
145  const __m128i m5 = _mm_load_si128( IS_RGB ? (const __m128i*)mask1 : (const __m128i*)mask5);
146 
147  const __m128i m6 = _mm_load_si128( IS_RGB ? (const __m128i*)mask10 : (const __m128i*)mask6);
148  const __m128i m7 = _mm_load_si128( IS_RGB ? (const __m128i*)mask11 : (const __m128i*)mask7);
149  const __m128i m8 = _mm_load_si128((const __m128i*)mask8);
150  const __m128i m9 = _mm_load_si128((const __m128i*)mask9);
151  const __m128i m10= _mm_load_si128( IS_RGB ? (const __m128i*)mask6 : (const __m128i*)mask10);
152  const __m128i m11= _mm_load_si128( IS_RGB ? (const __m128i*)mask7 : (const __m128i*)mask11);
153 
154  const __m128i mask_low= _mm_load_si128((const __m128i*)mask_to_low);
155 
156  const __m128i VAL_R = _mm_load_si128((const __m128i*)val_red);
157  const __m128i VAL_G = _mm_load_si128((const __m128i*)val_green);
158  const __m128i VAL_B = _mm_load_si128((const __m128i*)val_blue);
159 
160  const int sw = w >> 4; // This are the number of 3*16 blocks in each row
161  const int sh = h ;
162 
163  for (int i=0; i<sh; i++)
164  {
165  for (int j=0; j<sw; j++)
166  {
167  // We process RGB data in blocks of 3 x 16byte blocks:
168  const __m128i d0 = _mm_load_si128((const __m128i*)in); in += 16;
169  const __m128i d1 = _mm_load_si128((const __m128i*)in); in += 16;
170  const __m128i d2 = _mm_load_si128((const __m128i*)in); in += 16;
171 
172  // First 8 bytes of gray levels:
173  {
174  const __m128i BLUES_0_7 = _mm_or_si128(_mm_shuffle_epi8(d0,m0),_mm_shuffle_epi8(d1,m1));
175  const __m128i GREENS_0_7 = _mm_or_si128(_mm_shuffle_epi8(d0,m2),_mm_shuffle_epi8(d1,m3));
176  const __m128i REDS_0_7 = _mm_or_si128(_mm_shuffle_epi8(d0,m4),_mm_shuffle_epi8(d1,m5));
177 
178  // _mm_mulhi_epu16(): Multiplies the 8 unsigned 16-bit integers from a by the 8 unsigned 16-bit integers from b.
179  //r0 := (a0 * b0)[31:16]
180  //r1 := (a1 * b1)[31:16]
181  //...
182  //r7 := (a7 * b7)[31:16]
183  //
184  const __m128i GRAYS_0_7 =
185  _mm_adds_epu16(
186  _mm_mulhi_epu16(REDS_0_7, VAL_R),
187  _mm_adds_epu16(
188  _mm_mulhi_epu16(GREENS_0_7, VAL_G),
189  _mm_mulhi_epu16(BLUES_0_7, VAL_B)
190  ));
191 
192  _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi8(GRAYS_0_7,mask_low));
193  out+=8;
194  }
195 
196  // Second 8 bytes of gray levels:
197  {
198  const __m128i BLUES_8_15 = _mm_or_si128(_mm_shuffle_epi8(d1,m6),_mm_shuffle_epi8(d2,m7));
199  const __m128i GREENS_8_15 = _mm_or_si128(_mm_shuffle_epi8(d1,m8),_mm_shuffle_epi8(d2,m9));
200  const __m128i REDS_8_15 = _mm_or_si128(_mm_shuffle_epi8(d1,m10),_mm_shuffle_epi8(d2,m11));
201 
202  const __m128i GRAYS_8_15 =
203  _mm_adds_epu16(
204  _mm_mulhi_epu16(REDS_8_15, VAL_R),
205  _mm_adds_epu16(
206  _mm_mulhi_epu16(GREENS_8_15, VAL_G),
207  _mm_mulhi_epu16(BLUES_8_15, VAL_B)
208  ));
209 
210  _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi8(GRAYS_8_15,mask_low));
211  out+=8;
212  }
213  }
214  }
215 
216 } // end private_image_SSSE3_rgb_or_bgr_to_gray_8u()
217 
218 
219 /** Convert a RGB image (3cu8) into a GRAYSCALE (1c8u) image, using Y=77*R+150*G+29*B
220  * - <b>Input format:</b> uint8_t, 3 channels (BGR order)
221  * - <b>Output format:</b> uint8_t, 1 channel
222  * - <b>Preconditions:</b> in & out aligned to 16bytes, w = k*16 (w=width in pixels), widthStep=w*3 and w*1
223  * - <b>Notes:</b>
224  * - <b>Requires:</b> SSSE3
225  * - <b>Invoked from:</b> mrpt::utils::CImage::grayscale(), mrpt::utils::CImage::grayscaleInPlace()
226  */
227 void image_SSSE3_bgr_to_gray_8u(const uint8_t* in, uint8_t* out, int w, int h)
228 {
229  private_image_SSSE3_rgb_or_bgr_to_gray_8u<false>(in,out,w,h);
230 }
231 
232 /** Convert a RGB image (3cu8) into a GRAYSCALE (1c8u) image, using Y=77*R+150*G+29*B
233  * - <b>Input format:</b> uint8_t, 3 channels (RGB order)
234  * - <b>Output format:</b> uint8_t, 1 channel
235  * - <b>Preconditions:</b> in & out aligned to 16bytes, w = k*16 (w=width in pixels), widthStep=w*3 and w*1
236  * - <b>Notes:</b>
237  * - <b>Requires:</b> SSSE3
238  * - <b>Invoked from:</b> mrpt::utils::CImage::grayscale(), mrpt::utils::CImage::grayscaleInPlace()
239  */
240 void image_SSSE3_rgb_to_gray_8u(const uint8_t* in, uint8_t* out, int w, int h)
241 {
242  private_image_SSSE3_rgb_or_bgr_to_gray_8u<true>(in,out,w,h);
243 }
244 
245 
246 /** @} */
247 
248 #endif // end of MRPT_HAS_SSE3



Page generated by Doxygen 1.8.3 for MRPT 0.9.6 SVN: at Fri Feb 15 22:05:02 EST 2013