00001 /* +---------------------------------------------------------------------------+ 00002 | The Mobile Robot Programming Toolkit (MRPT) C++ library | 00003 | | 00004 | http://www.mrpt.org/ | 00005 | | 00006 | Copyright (C) 2005-2011 University of Malaga | 00007 | | 00008 | This software was written by the Machine Perception and Intelligent | 00009 | Robotics Lab, University of Malaga (Spain). | 00010 | Contact: Jose-Luis Blanco <jlblanco@ctima.uma.es> | 00011 | | 00012 | This file is part of the MRPT project. | 00013 | | 00014 | MRPT is free software: you can redistribute it and/or modify | 00015 | it under the terms of the GNU General Public License as published by | 00016 | the Free Software Foundation, either version 3 of the License, or | 00017 | (at your option) any later version. | 00018 | | 00019 | MRPT is distributed in the hope that it will be useful, | 00020 | but WITHOUT ANY WARRANTY; without even the implied warranty of | 00021 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 00022 | GNU General Public License for more details. | 00023 | | 00024 | You should have received a copy of the GNU General Public License | 00025 | along with MRPT. If not, see <http://www.gnu.org/licenses/>. | 00026 | | 00027 +---------------------------------------------------------------------------+ */ 00028 00029 #include <mrpt/base.h> // Precompiled headers 00030 00031 #if MRPT_HAS_SSE2 00032 // --------------------------------------------------------------------------- 00033 // This file contains the SSE2 optimized functions for mrpt::utils::CImage 00034 // See the sources and the doxygen documentation page "sse_optimizations" for more details. 00035 // 00036 // Some functions here are derived from sources in libcvd, released 00037 // under LGPL. 
See http://mi.eng.cam.ac.uk/~er258/cvd/ 00038 // 00039 // --------------------------------------------------------------------------- 00040 00041 #include <mrpt/utils/CImage.h> 00042 #include <mrpt/utils/SSE_types.h> 00043 #include <mrpt/utils/SSE_macros.h> 00044 #include "CImage_SSEx.h" 00045 00046 /** \addtogroup sse_optimizations 00047 * SSE optimized functions 00048 * @{ 00049 */ 00050 00051 /** Subsample each 2x2 pixel block into 1x1 pixel, taking the first pixel & ignoring the other 3 00052 * - <b>Input format:</b> uint8_t, 1 channel 00053 * - <b>Output format:</b> uint8_t, 1 channel 00054 * - <b>Preconditions:</b> in & out aligned to 16bytes, w = k*16 (w=width in pixels), widthStep=w*1 00055 * - <b>Notes:</b> 00056 * - <b>Requires:</b> SSE2 00057 * - <b>Invoked from:</b> mrpt::utils::CImage::scaleHalf() 00058 */ 00059 void image_SSE2_scale_half_1c8u(const uint8_t* in, uint8_t* out, int w, int h) 00060 { 00061 EIGEN_ALIGN16 const unsigned long long mask[2] = {0x00FF00FF00FF00FFull, 0x00FF00FF00FF00FFull}; 00062 const __m128i m = _mm_load_si128((const __m128i*)mask); 00063 00064 int sw = w >> 4; 00065 int sh = h >> 1; 00066 00067 for (int i=0; i<sh; i++) 00068 { 00069 for (int j=0; j<sw; j++) 00070 { 00071 const __m128i here_sampled = _mm_and_si128( _mm_load_si128((const __m128i*)in), m); 00072 _mm_storel_epi64((__m128i*)out, _mm_packus_epi16(here_sampled,here_sampled)); 00073 in += 16; 00074 out += 8; 00075 } 00076 in += w; 00077 } 00078 } 00079 00080 00081 /** Average each 2x2 pixels into 1x1 pixel (arithmetic average) 00082 * - <b>Input format:</b> uint8_t, 1 channel 00083 * - <b>Output format:</b> uint8_t, 1 channel 00084 * - <b>Preconditions:</b> in & out aligned to 16bytes, w = k*16 (w=width in pixels), widthStep=w*1 00085 * - <b>Notes:</b> 00086 * - <b>Requires:</b> SSE2 00087 * - <b>Invoked from:</b> mrpt::utils::CImage::scaleHalfSmooth() 00088 */ 00089 void image_SSE2_scale_half_smooth_1c8u(const uint8_t* in, uint8_t* out, int w, int h) 00090 { 00091 
EIGEN_ALIGN16 const unsigned long long mask[2] = {0x00FF00FF00FF00FFull, 0x00FF00FF00FF00FFull}; 00092 const uint8_t* nextRow = in + w; 00093 __m128i m = _mm_load_si128((const __m128i*)mask); 00094 int sw = w >> 4; 00095 int sh = h >> 1; 00096 00097 for (int i=0; i<sh; i++) 00098 { 00099 for (int j=0; j<sw; j++) 00100 { 00101 __m128i here = _mm_load_si128((const __m128i*)in); 00102 __m128i next = _mm_load_si128((const __m128i*)nextRow); 00103 here = _mm_avg_epu8(here,next); 00104 next = _mm_and_si128(_mm_srli_si128(here,1), m); 00105 here = _mm_and_si128(here,m); 00106 here = _mm_avg_epu16(here, next); 00107 _mm_storel_epi64((__m128i*)out, _mm_packus_epi16(here,here)); 00108 in += 16; 00109 nextRow += 16; 00110 out += 8; 00111 } 00112 00113 in += w; 00114 nextRow += w; 00115 } 00116 } 00117 00118 00119 00120 /** KLT score at a given point of a grayscale image. 00121 * - <b>Requires:</b> SSE2 00122 * - <b>Invoked from:</b> mrpt::utils::CImage::KLT_response() 00123 * 00124 * This function is not manually optimized for SSE2 but templatized for different 00125 * window sizes such as the compiler can optimize automatically for that size. 00126 * 00127 * Only for the most common window sizes this templates are instantiated (W=[2-16] and W=32 ), 00128 * falling back to 00129 * a generic implementation otherwise. The next figure shows the performance (time for 00130 * KLT_response() to compute the score for one single pixel) for different window sizes. 00131 * 00132 * <img src="KLT_response_performance_SSE2.png" > 00133 * 00134 */ 00135 float KLT_response_optimized(); 00136 00137 // TODO: 00138 // Sum of absolute differences: Use _mm_sad_epu8 00139 00140 /** @} */ 00141 00142 #endif // end if MRPT_HAS_SSE2
// Page generated by Doxygen 1.7.5 for MRPT 0.9.5 SVN: at Thu Oct 13 21:25:36 UTC 2011