Main MRPT website > C++ reference
MRPT logo
CImage_SSE3.cpp
Go to the documentation of this file.
00001 /* +---------------------------------------------------------------------------+
00002    |          The Mobile Robot Programming Toolkit (MRPT) C++ library          |
00003    |                                                                           |
00004    |                       http://www.mrpt.org/                                |
00005    |                                                                           |
00006    |   Copyright (C) 2005-2011  University of Malaga                           |
00007    |                                                                           |
00008    |    This software was written by the Machine Perception and Intelligent    |
00009    |      Robotics Lab, University of Malaga (Spain).                          |
00010    |    Contact: Jose-Luis Blanco  <jlblanco@ctima.uma.es>                     |
00011    |                                                                           |
00012    |  This file is part of the MRPT project.                                   |
00013    |                                                                           |
00014    |     MRPT is free software: you can redistribute it and/or modify          |
00015    |     it under the terms of the GNU General Public License as published by  |
00016    |     the Free Software Foundation, either version 3 of the License, or     |
00017    |     (at your option) any later version.                                   |
00018    |                                                                           |
00019    |   MRPT is distributed in the hope that it will be useful,                 |
00020    |     but WITHOUT ANY WARRANTY; without even the implied warranty of        |
00021    |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         |
00022    |     GNU General Public License for more details.                          |
00023    |                                                                           |
00024    |     You should have received a copy of the GNU General Public License     |
00025    |     along with MRPT.  If not, see <http://www.gnu.org/licenses/>.         |
00026    |                                                                           |
00027    +---------------------------------------------------------------------------+ */
00028 
00029 #include <mrpt/base.h>  // Precompiled headers
00030 
00031 // ---------------------------------------------------------------------------
00032 //   This file contains the SSE3/SSSE3 optimized functions for mrpt::utils::CImage
00033 //    See the sources and the doxygen documentation page "sse_optimizations" for more details.
00034 // ---------------------------------------------------------------------------
00035 #if MRPT_HAS_SSE3
00036 
00037 #include <mrpt/utils/CImage.h>
00038 #include <mrpt/utils/SSE_types.h>
00039 #include <mrpt/utils/SSE_macros.h>
00040 #include "CImage_SSEx.h"
00041 
00042 
00043 /** \addtogroup sse_optimizations
00044  *  SSE optimized functions
00045  *  @{
00046  */
00047 
00048 /** Subsample each 2x2 pixel block into 1x1 pixel, taking the first pixel & ignoring the other 3
00049   *  - <b>Input format:</b> uint8_t, 3 channels (RGB or BGR)
00050   *  - <b>Output format:</b> uint8_t, 3 channels (RGB or BGR)
00051   *  - <b>Preconditions:</b> in & out aligned to 16bytes, w = k*16 (w=width in pixels), widthStep=w*3
00052   *  - <b>Notes:</b>
00053   *  - <b>Requires:</b> SSSE3
00054   *  - <b>Invoked from:</b> mrpt::utils::CImage::scaleHalf()
00055   */
00056 void image_SSSE3_scale_half_3c8u(const uint8_t* in, uint8_t* out, int w, int h)
00057 {
00058         EIGEN_ALIGN16 const unsigned long long mask0[2] = { 0x0D0C080706020100ull, 0x808080808080800Eull }; // Long words are in inverse order due to little endianness
00059         EIGEN_ALIGN16 const unsigned long long mask1[2] = { 0x8080808080808080ull, 0x0E0A090804030280ull };
00060         EIGEN_ALIGN16 const unsigned long long mask2[2] = { 0x0C0B0A0605040080ull, 0x8080808080808080ull };
00061         EIGEN_ALIGN16 const unsigned long long mask3[2] = { 0x808080808080800Full, 0x8080808080808080ull };
00062 
00063         const __m128i m0 = _mm_load_si128((const __m128i*)mask0);
00064         const __m128i m1 = _mm_load_si128((const __m128i*)mask1);
00065         const __m128i m2 = _mm_load_si128((const __m128i*)mask2);
00066         const __m128i m3 = _mm_load_si128((const __m128i*)mask3);
00067 
00068         const int sw = w >> 4;  // This are the number of 3*16 blocks in each row
00069         const int sh = h >> 1;
00070 
00071         int odd_row = 0; 
00072 
00073         for (int i=0; i<sh; i++)
00074         {
00075                 for (int j=0; j<sw; j++)
00076                 {
00077                         // 16-byte blocks #0,#1,#2:
00078                         __m128i d0 = _mm_load_si128((const __m128i*)in); in += 16;
00079                         __m128i d1 = _mm_load_si128((const __m128i*)in); in += 16;
00080 
00081                         // First 16 bytes:
00082                         __m128i shuf0 = _mm_shuffle_epi8(d0,m0);
00083                         __m128i shuf1 = _mm_shuffle_epi8(d1,m1);
00084 
00085                         __m128i res0 = _mm_or_si128(shuf0,shuf1);
00086 
00087                         if ((odd_row&0x1)!=0)
00088                              _mm_storeu_si128((__m128i*)out,res0);  // unaligned output
00089                         else _mm_store_si128 ((__m128i*)out,res0);  // aligned output
00090                         out += 16;
00091 
00092                         // Last 8 bytes:
00093                         __m128i d2 = _mm_load_si128((const __m128i*)in); in += 16;
00094 
00095                         _mm_storel_epi64(       // Write lower 8 bytes only
00096                                 (__m128i*)out,
00097                                 _mm_or_si128(_mm_shuffle_epi8(d2,m2),_mm_shuffle_epi8(d1,m3))
00098                                 );
00099                         odd_row++;
00100                         out += 8;
00101                 }
00102                 in += 3*w;
00103         }
00104 }
00105 
00106 
00107 // This is the actual function behind both: image_SSSE3_rgb_to_gray_8u() and image_SSSE3_bgr_to_gray_8u():
00108 template <bool IS_RGB>
00109 void private_image_SSSE3_rgb_or_bgr_to_gray_8u(const uint8_t* in, uint8_t* out, int w, int h)
00110 {
00111         // Masks:                 0  1   2  3   4  5   6  7   8 9    A  B   C  D  E  F
00112         BUILD_128BIT_CONST(mask0, 80,00, 80,03, 80,06, 80,09, 80,0C, 80,0F, 80,80, 80,80) // reds[0-7] from D0
00113         BUILD_128BIT_CONST(mask1, 80,80, 80,80, 80,80, 80,80, 80,80, 80,80, 80,02, 80,05) // reds[0-7] from D1
00114 
00115         BUILD_128BIT_CONST(mask2, 80,01, 80,04, 80,07, 80,0A, 80,0D, 80,80, 80,80, 80,80) // greens[0-7] from D0
00116         BUILD_128BIT_CONST(mask3, 80,80, 80,80, 80,80, 80,80, 80,80, 80,00, 80,03, 80,06) // greens[0-7] from D1
00117 
00118         BUILD_128BIT_CONST(mask4, 80,02, 80,05, 80,08, 80,0B, 80,0E, 80,80, 80,80, 80,80) // blues[0-7] from D0
00119         BUILD_128BIT_CONST(mask5, 80,80, 80,80, 80,80, 80,80, 80,80, 80,01, 80,04, 80,07) // blues[0-7] from D1
00120 
00121 
00122         BUILD_128BIT_CONST(mask6, 80,08, 80,0B, 80,0E, 80,80, 80,80, 80,80, 80,80, 80,80) // reds[8-15] from D1
00123         BUILD_128BIT_CONST(mask7, 80,80, 80,80, 80,80, 80,01, 80,04, 80,07, 80,0A, 80,0D) // reds[8-15] from D2
00124 
00125         BUILD_128BIT_CONST(mask8, 80,09, 80,0C, 80,0F, 80,80, 80,80, 80,80, 80,80, 80,80) // greens[8-15] from D1
00126         BUILD_128BIT_CONST(mask9, 80,80, 80,80, 80,80, 80,02, 80,05, 80,08, 80,0B, 80,0E) // greens[8-15] from D2
00127 
00128         BUILD_128BIT_CONST(mask10,80,0A, 80,0D, 80,80, 80,80, 80,80, 80,80, 80,80, 80,80) // blues[8-15] from D1
00129         BUILD_128BIT_CONST(mask11,80,80, 80,80, 80,00, 80,03, 80,06, 80,09, 80,0C, 80,0F) // blues[8-15] from D2
00130 
00131 
00132         BUILD_128BIT_CONST(mask_to_low, 01,03,05,07,09,0B,0D,0F, 80,80,80,80,80,80,80,80)
00133 
00134 
00135         // Conversion factors for RGB->Y
00136         BUILD_128BIT_CONST(val_red   , 00,1D, 00,1D, 00,1D, 00,1D, 00,1D, 00,1D, 00,1D, 00,1D)
00137         BUILD_128BIT_CONST(val_green , 00,96, 00,96, 00,96, 00,96, 00,96, 00,96, 00,96, 00,96)
00138         BUILD_128BIT_CONST(val_blue  , 00,4D, 00,4D, 00,4D, 00,4D, 00,4D, 00,4D, 00,4D, 00,4D)
00139 
00140         const __m128i m0 = _mm_load_si128( IS_RGB ? (const __m128i*)mask4 : (const __m128i*)mask0);
00141         const __m128i m1 = _mm_load_si128( IS_RGB ? (const __m128i*)mask5 : (const __m128i*)mask1);
00142         const __m128i m2 = _mm_load_si128((const __m128i*)mask2);
00143         const __m128i m3 = _mm_load_si128((const __m128i*)mask3);
00144         const __m128i m4 = _mm_load_si128( IS_RGB ? (const __m128i*)mask0 : (const __m128i*)mask4);
00145         const __m128i m5 = _mm_load_si128( IS_RGB ? (const __m128i*)mask1 : (const __m128i*)mask5);
00146 
00147         const __m128i m6 = _mm_load_si128( IS_RGB ? (const __m128i*)mask10 : (const __m128i*)mask6);
00148         const __m128i m7 = _mm_load_si128( IS_RGB ? (const __m128i*)mask11 : (const __m128i*)mask7);
00149         const __m128i m8 = _mm_load_si128((const __m128i*)mask8);
00150         const __m128i m9 = _mm_load_si128((const __m128i*)mask9);
00151         const __m128i m10= _mm_load_si128( IS_RGB ? (const __m128i*)mask6 : (const __m128i*)mask10);
00152         const __m128i m11= _mm_load_si128( IS_RGB ? (const __m128i*)mask7 : (const __m128i*)mask11);
00153 
00154         const __m128i mask_low= _mm_load_si128((const __m128i*)mask_to_low);
00155 
00156         const __m128i VAL_R = _mm_load_si128((const __m128i*)val_red);
00157         const __m128i VAL_G = _mm_load_si128((const __m128i*)val_green);
00158         const __m128i VAL_B = _mm_load_si128((const __m128i*)val_blue);
00159 
00160         const int sw = w >> 4;  // This are the number of 3*16 blocks in each row
00161         const int sh = h ;
00162 
00163         for (int i=0; i<sh; i++)
00164         {
00165                 for (int j=0; j<sw; j++)
00166                 {
00167                         // We process RGB data in blocks of 3 x 16byte blocks:
00168                         const __m128i d0 = _mm_load_si128((const __m128i*)in); in += 16;
00169                         const __m128i d1 = _mm_load_si128((const __m128i*)in); in += 16;
00170                         const __m128i d2 = _mm_load_si128((const __m128i*)in); in += 16;
00171 
00172                         // First 8 bytes of gray levels:
00173                         {
00174                                 const __m128i BLUES_0_7  = _mm_or_si128(_mm_shuffle_epi8(d0,m0),_mm_shuffle_epi8(d1,m1));
00175                                 const __m128i GREENS_0_7 = _mm_or_si128(_mm_shuffle_epi8(d0,m2),_mm_shuffle_epi8(d1,m3));
00176                                 const __m128i REDS_0_7   = _mm_or_si128(_mm_shuffle_epi8(d0,m4),_mm_shuffle_epi8(d1,m5));
00177 
00178                                 // _mm_mulhi_epu16(): Multiplies the 8 unsigned 16-bit integers from a by the 8 unsigned 16-bit integers from b.
00179                                 //r0 := (a0 * b0)[31:16]
00180                                 //r1 := (a1 * b1)[31:16]
00181                                 //...
00182                                 //r7 := (a7 * b7)[31:16]
00183                                 //
00184                                 const __m128i GRAYS_0_7 =
00185                                         _mm_adds_epu16(
00186                                                 _mm_mulhi_epu16(REDS_0_7,   VAL_R),
00187                                         _mm_adds_epu16(
00188                                                 _mm_mulhi_epu16(GREENS_0_7, VAL_G),
00189                                                 _mm_mulhi_epu16(BLUES_0_7,  VAL_B)
00190                                         ));
00191 
00192                                 _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi8(GRAYS_0_7,mask_low));
00193                                 out+=8;
00194                         }
00195 
00196                         // Second 8 bytes of gray levels:
00197                         {
00198                                 const __m128i BLUES_8_15  = _mm_or_si128(_mm_shuffle_epi8(d1,m6),_mm_shuffle_epi8(d2,m7));
00199                                 const __m128i GREENS_8_15 = _mm_or_si128(_mm_shuffle_epi8(d1,m8),_mm_shuffle_epi8(d2,m9));
00200                                 const __m128i REDS_8_15   = _mm_or_si128(_mm_shuffle_epi8(d1,m10),_mm_shuffle_epi8(d2,m11));
00201 
00202                                 const __m128i GRAYS_8_15 =
00203                                         _mm_adds_epu16(
00204                                                 _mm_mulhi_epu16(REDS_8_15,   VAL_R),
00205                                         _mm_adds_epu16(
00206                                                 _mm_mulhi_epu16(GREENS_8_15, VAL_G),
00207                                                 _mm_mulhi_epu16(BLUES_8_15,  VAL_B)
00208                                         ));
00209 
00210                                 _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi8(GRAYS_8_15,mask_low));
00211                                 out+=8;
00212                         }
00213                 }
00214         }
00215 
00216 } // end private_image_SSSE3_rgb_or_bgr_to_gray_8u()
00217 
00218 
00219 /** Convert a RGB image (3cu8) into a GRAYSCALE (1c8u) image, using Y=77*R+150*G+29*B
00220   *  - <b>Input format:</b> uint8_t, 3 channels (BGR order)
00221   *  - <b>Output format:</b> uint8_t, 1 channel
00222   *  - <b>Preconditions:</b> in & out aligned to 16bytes, w = k*16 (w=width in pixels), widthStep=w*3 and w*1
00223   *  - <b>Notes:</b>
00224   *  - <b>Requires:</b> SSSE3
00225   *  - <b>Invoked from:</b> mrpt::utils::CImage::grayscale(), mrpt::utils::CImage::grayscaleInPlace()
00226   */
00227 void image_SSSE3_bgr_to_gray_8u(const uint8_t* in, uint8_t* out, int w, int h)
00228 {
00229         private_image_SSSE3_rgb_or_bgr_to_gray_8u<false>(in,out,w,h);
00230 }
00231 
00232 /** Convert a RGB image (3cu8) into a GRAYSCALE (1c8u) image, using Y=77*R+150*G+29*B
00233   *  - <b>Input format:</b> uint8_t, 3 channels (RGB order)
00234   *  - <b>Output format:</b> uint8_t, 1 channel
00235   *  - <b>Preconditions:</b> in & out aligned to 16bytes, w = k*16 (w=width in pixels), widthStep=w*3 and w*1
00236   *  - <b>Notes:</b>
00237   *  - <b>Requires:</b> SSSE3
00238   *  - <b>Invoked from:</b> mrpt::utils::CImage::grayscale(), mrpt::utils::CImage::grayscaleInPlace()
00239   */
00240 void image_SSSE3_rgb_to_gray_8u(const uint8_t* in, uint8_t* out, int w, int h)
00241 {
00242         private_image_SSSE3_rgb_or_bgr_to_gray_8u<true>(in,out,w,h);
00243 }
00244 
00245 
00246 /**  @} */
00247 
00248 #endif // end of MRPT_HAS_SSE3



Page generated by Doxygen 1.7.5 for MRPT 0.9.5 SVN: at Thu Oct 13 21:25:36 UTC 2011