Main MRPT website > C++ reference
MRPT logo
CImage_SSE2.cpp
Go to the documentation of this file.
00001 /* +---------------------------------------------------------------------------+
00002    |          The Mobile Robot Programming Toolkit (MRPT) C++ library          |
00003    |                                                                           |
00004    |                       http://www.mrpt.org/                                |
00005    |                                                                           |
00006    |   Copyright (C) 2005-2011  University of Malaga                           |
00007    |                                                                           |
00008    |    This software was written by the Machine Perception and Intelligent    |
00009    |      Robotics Lab, University of Malaga (Spain).                          |
00010    |    Contact: Jose-Luis Blanco  <jlblanco@ctima.uma.es>                     |
00011    |                                                                           |
00012    |  This file is part of the MRPT project.                                   |
00013    |                                                                           |
00014    |     MRPT is free software: you can redistribute it and/or modify          |
00015    |     it under the terms of the GNU General Public License as published by  |
00016    |     the Free Software Foundation, either version 3 of the License, or     |
00017    |     (at your option) any later version.                                   |
00018    |                                                                           |
00019    |   MRPT is distributed in the hope that it will be useful,                 |
00020    |     but WITHOUT ANY WARRANTY; without even the implied warranty of        |
00021    |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         |
00022    |     GNU General Public License for more details.                          |
00023    |                                                                           |
00024    |     You should have received a copy of the GNU General Public License     |
00025    |     along with MRPT.  If not, see <http://www.gnu.org/licenses/>.         |
00026    |                                                                           |
00027    +---------------------------------------------------------------------------+ */
00028 
00029 #include <mrpt/base.h>  // Precompiled headers
00030 
00031 #if MRPT_HAS_SSE2
00032 // ---------------------------------------------------------------------------
00033 //   This file contains the SSE2 optimized functions for mrpt::utils::CImage
00034 //    See the sources and the doxygen documentation page "sse_optimizations" for more details.
00035 //
00036 //  Some functions here are derived from sources in libcvd, released
00037 //   under LGPL. See http://mi.eng.cam.ac.uk/~er258/cvd/
00038 //
00039 // ---------------------------------------------------------------------------
00040 
00041 #include <mrpt/utils/CImage.h>
00042 #include <mrpt/utils/SSE_types.h>
00043 #include <mrpt/utils/SSE_macros.h>
00044 #include "CImage_SSEx.h"
00045 
00046 /** \addtogroup sse_optimizations
00047  *  SSE optimized functions
00048  *  @{
00049  */
00050 
/** Subsample each 2x2 pixel block into 1x1 pixel, keeping the top-left pixel of the
  *  block and discarding the other 3 (no filtering).
  *  - <b>Input format:</b> uint8_t, 1 channel
  *  - <b>Output format:</b> uint8_t, 1 channel
  *  - <b>Preconditions:</b> in & out aligned to 16bytes, w = k*16 (w=width in pixels), widthStep=w*1
  *  - <b>Notes:</b> Each inner iteration reads 16 input bytes with an aligned load and writes
  *     8 output bytes. Only the first h/2 even-indexed input rows are read.
  *  - <b>Requires:</b> SSE2
  *  - <b>Invoked from:</b> mrpt::utils::CImage::scaleHalf()
  *
  *  \param in   First byte of the input image (rows stored contiguously, w bytes each).
  *  \param out  First byte of the output image; receives (w/16)*8 bytes per output row.
  *  \param w    Input width in pixels.
  *  \param h    Input height in pixels; h/2 output rows are produced.
  */
void image_SSE2_scale_half_1c8u(const uint8_t* in, uint8_t* out, int w, int h)
{
        // 0x00FF in every 16-bit lane: keeps the even-indexed byte of each pair of pixels.
        const __m128i evenByteMask = _mm_set1_epi16(0x00FF);

        const int blocksPerRow = w >> 4;  // 16 input pixels are consumed per block
        const int outRows      = h >> 1;

        for (int row = 0; row < outRows; ++row)
        {
                for (int blk = 0; blk < blocksPerRow; ++blk)
                {
                        const __m128i pixels = _mm_load_si128(reinterpret_cast<const __m128i*>(in));
                        const __m128i kept   = _mm_and_si128(pixels, evenByteMask);
                        // Every 16-bit lane is now <= 255, so the saturating pack simply
                        // narrows the 8 kept pixels into the low 8 bytes.
                        _mm_storel_epi64(reinterpret_cast<__m128i*>(out), _mm_packus_epi16(kept, kept));
                        in  += 16;
                        out += 8;
                }
                // Skip the odd input row (relies on the precondition w = k*16).
                in += w;
        }
}
00079 
00080 
/** Average each 2x2 pixels into 1x1 pixel.
  *  - <b>Input format:</b> uint8_t, 1 channel
  *  - <b>Output format:</b> uint8_t, 1 channel
  *  - <b>Preconditions:</b> in & out aligned to 16bytes, w = k*16 (w=width in pixels), widthStep=w*1
  *  - <b>Notes:</b> The average is computed in two stages, each one rounding upwards:
  *     first the two rows are averaged (_mm_avg_epu8), then the two horizontally adjacent
  *     results (_mm_avg_epu16). The output can therefore differ by one from the exactly
  *     rounded (a+b+c+d)/4.
  *  - <b>Requires:</b> SSE2
  *  - <b>Invoked from:</b> mrpt::utils::CImage::scaleHalfSmooth()
  *
  *  \param in   First byte of the input image (rows stored contiguously, w bytes each).
  *  \param out  First byte of the output image; receives (w/16)*8 bytes per output row.
  *  \param w    Input width in pixels.
  *  \param h    Input height in pixels; h/2 output rows are produced.
  */
void image_SSE2_scale_half_smooth_1c8u(const uint8_t* in, uint8_t* out, int w, int h)
{
        // 0x00FF in every 16-bit lane: isolates the low byte of each 16-bit lane.
        const __m128i lowByteMask = _mm_set1_epi16(0x00FF);

        const uint8_t* nextRow = in + w;  // the row just below `in`

        const int blocksPerRow = w >> 4;  // 16 input pixels are consumed per block
        const int outRows      = h >> 1;

        for (int row = 0; row < outRows; ++row)
        {
                for (int blk = 0; blk < blocksPerRow; ++blk)
                {
                        const __m128i upper = _mm_load_si128(reinterpret_cast<const __m128i*>(in));
                        const __m128i lower = _mm_load_si128(reinterpret_cast<const __m128i*>(nextRow));

                        // Stage 1: vertical average of the two rows, byte by byte.
                        const __m128i vertical = _mm_avg_epu8(upper, lower);

                        // Stage 2: split into odd / even columns, widened to 16-bit lanes,
                        // and average each horizontal neighbour pair.
                        const __m128i oddCols  = _mm_and_si128(_mm_srli_si128(vertical, 1), lowByteMask);
                        const __m128i evenCols = _mm_and_si128(vertical, lowByteMask);
                        const __m128i averaged = _mm_avg_epu16(evenCols, oddCols);

                        // All lanes are <= 255, so the saturating pack just narrows to 8 bytes.
                        _mm_storel_epi64(reinterpret_cast<__m128i*>(out), _mm_packus_epi16(averaged, averaged));

                        in      += 16;
                        nextRow += 16;
                        out     += 8;
                }

                // Both pointers step over the row the other one just processed
                // (relies on the precondition w = k*16).
                in      += w;
                nextRow += w;
        }
}
00117 
00118 
00119 
00120 /** KLT score at a given point of a grayscale image.
00121   *  - <b>Requires:</b> SSE2
00122   *  - <b>Invoked from:</b> mrpt::utils::CImage::KLT_response()
00123   *
00124   *  This function is not manually optimized for SSE2 but templatized for different
00125   *   window sizes so that the compiler can optimize automatically for each size.
00126   *
00127   *  Only for the most common window sizes are these templates instantiated (W=[2-16] and W=32),
00128   *   falling back to
00129   *   a generic implementation otherwise. The next figure shows the performance (time for
00130   *   KLT_response() to compute the score for one single pixel) for different window sizes.
00131   *
00132   *  <img src="KLT_response_performance_SSE2.png" >
00133   *
00134   */
00135 float KLT_response_optimized();
00136 
00137 // TODO:
00138 // Sum of absolute differences: Use  _mm_sad_epu8
00139 
00140 /**  @} */
00141 
00142 #endif // end if MRPT_HAS_SSE2



Page generated by Doxygen 1.7.5 for MRPT 0.9.5 SVN: at Thu Oct 13 21:25:36 UTC 2011