40 #include "CImage_SSEx.h"
// NOTE(review): the lines below are the surviving statements of a routine whose signature,
// braces and several statements are missing from this excerpt; `in`, `out`, `w` and `h` are
// declared outside the visible lines.
//
// mask0..mask3 are control vectors for _mm_shuffle_epi8 (PSHUFB): a control byte with its
// top bit set (0x80) yields a zero output byte, any other value selects the source byte with
// that index. Read in memory order on a little-endian target the tables are:
//   mask0: 00 01 02 06 07 08 0C 0D 0E 80 80 80 80 80 80 80   (applied to d0)
//   mask1: 80 80 80 80 80 80 80 80 80 02 03 04 08 09 0A 0E   (applied to d1)
//   mask2: 80 00 04 05 06 0A 0B 0C 80 80 80 80 80 80 80 80   (applied to d2)
//   mask3: 0F 80 80 80 80 80 80 80 80 80 80 80 80 80 80 80   (applied to d1)
// Together they keep every other 3-byte group of the 48 bytes loaded per inner iteration.
// EIGEN_ALIGN16 presumably requests the 16-byte alignment that _mm_load_si128 requires.
58 EIGEN_ALIGN16
const unsigned long long mask0[2] = { 0x0D0C080706020100ull, 0x808080808080800Eull };
59 EIGEN_ALIGN16
const unsigned long long mask1[2] = { 0x8080808080808080ull, 0x0E0A090804030280ull };
60 EIGEN_ALIGN16
const unsigned long long mask2[2] = { 0x0C0B0A0605040080ull, 0x8080808080808080ull };
61 EIGEN_ALIGN16
const unsigned long long mask3[2] = { 0x808080808080800Full, 0x8080808080808080ull };
// Load the four control tables into registers once, outside the loops.
63 const __m128i m0 = _mm_load_si128((
const __m128i*)mask0);
64 const __m128i m1 = _mm_load_si128((
const __m128i*)mask1);
65 const __m128i m2 = _mm_load_si128((
const __m128i*)mask2);
66 const __m128i m3 = _mm_load_si128((
const __m128i*)mask3);
// sw: number of 16-unit blocks per row (w / 16, remainder dropped).
// sh: half the row count (h / 2, remainder dropped).
// NOTE(review): assumes w is a pixel count with 3 bytes per pixel, so one block is the
// 48 bytes read per inner iteration -- confirm against the caller.
68 const int sw = w >> 4;
69 const int sh = h >> 1;
73 for (
int i=0; i<sh; i++)
75 for (
int j=0; j<sw; j++)
// Aligned 16-byte loads: `in` must be 16-byte aligned at every iteration.
78 __m128i d0 = _mm_load_si128((
const __m128i*)in); in += 16;
79 __m128i d1 = _mm_load_si128((
const __m128i*)in); in += 16;
// First 16 output bytes: 9 bytes gathered from d0 and 7 from d1; the zeroed positions of
// the two shuffles are complementary, so OR merges them.
82 __m128i shuf0 = _mm_shuffle_epi8(d0,m0);
83 __m128i shuf1 = _mm_shuffle_epi8(d1,m1);
85 __m128i res0 = _mm_or_si128(shuf0,shuf1);
// NOTE(review): the condition choosing between the unaligned and the aligned store is not
// visible in this excerpt, nor is any advance of `out` after the store.
88 _mm_storeu_si128((__m128i*)out,res0);
89 else _mm_store_si128 ((__m128i*)out,res0);
93 __m128i d2 = _mm_load_si128((
const __m128i*)in); in += 16;
// Remaining 8 output bytes: 1 byte from d1 (index 0x0F) plus 7 bytes from d2. This
// expression is an argument of a call whose opening line is missing from the excerpt
// (presumably a store to `out`).
97 _mm_or_si128(_mm_shuffle_epi8(d2,m2),_mm_shuffle_epi8(d1,m3))
// NOTE(review): surviving statements of a template routine whose signature, braces and
// several statements are missing from this excerpt. It is presumably
// private_image_SSSE3_rgb_or_bgr_to_gray_8u, which is instantiated with <false> and <true>
// further down; `in`, `out`, `w`, `h` and `sh` are declared outside the visible lines.
//
// IS_RGB selects which byte offset inside each 3-byte pixel is treated as red and which as
// blue (see the m0/m1/m4/m5 and m6/m7/m10/m11 selections below).
108 template <
bool IS_RGB>
// Shuffle-control tables for _mm_shuffle_epi8: 0x80 yields a zero byte, any other value is
// a source byte index. Every listed pair is 0x80 plus one index (or 0x80,0x80), so each pair
// contributes at most one source byte next to a zero byte. NOTE(review): the byte order
// produced by BUILD_128BIT_CONST is not visible here, so which half of a 16-bit lane
// receives the data byte cannot be stated from this excerpt.
//
// Pixels 0..7 (sources d0 then d1); indices advance by 3, i.e. one byte per pixel:
//   mask0/mask1: byte offset 0 of each pixel
112 BUILD_128BIT_CONST(mask0, 80,00, 80,03, 80,06, 80,09, 80,0C, 80,0F, 80,80, 80,80)
113 BUILD_128BIT_CONST(mask1, 80,80, 80,80, 80,80, 80,80, 80,80, 80,80, 80,02, 80,05)
//   mask2/mask3: byte offset 1 of each pixel
115 BUILD_128BIT_CONST(mask2, 80,01, 80,04, 80,07, 80,0A, 80,0D, 80,80, 80,80, 80,80)
116 BUILD_128BIT_CONST(mask3, 80,80, 80,80, 80,80, 80,80, 80,80, 80,00, 80,03, 80,06)
//   mask4/mask5: byte offset 2 of each pixel
118 BUILD_128BIT_CONST(mask4, 80,02, 80,05, 80,08, 80,0B, 80,0E, 80,80, 80,80, 80,80)
119 BUILD_128BIT_CONST(mask5, 80,80, 80,80, 80,80, 80,80, 80,80, 80,01, 80,04, 80,07)
// Pixels 8..15 (sources d1 then d2):
//   mask6/mask7: byte offset 0 of each pixel
122 BUILD_128BIT_CONST(mask6, 80,08, 80,0B, 80,0E, 80,80, 80,80, 80,80, 80,80, 80,80)
123 BUILD_128BIT_CONST(mask7, 80,80, 80,80, 80,80, 80,01, 80,04, 80,07, 80,0A, 80,0D)
//   mask8/mask9: byte offset 1 of each pixel
125 BUILD_128BIT_CONST(mask8, 80,09, 80,0C, 80,0F, 80,80, 80,80, 80,80, 80,80, 80,80)
126 BUILD_128BIT_CONST(mask9, 80,80, 80,80, 80,80, 80,02, 80,05, 80,08, 80,0B, 80,0E)
//   mask10/mask11: byte offset 2 of each pixel
128 BUILD_128BIT_CONST(mask10,80,0A, 80,0D, 80,80, 80,80, 80,80, 80,80, 80,80, 80,80)
129 BUILD_128BIT_CONST(mask11,80,80, 80,80, 80,00, 80,03, 80,06, 80,09, 80,0C, 80,0F)
// Gathers the eight odd-indexed bytes (01,03,...,0F) into 8 contiguous bytes and zeroes the
// rest: one byte taken from every 16-bit lane of the weighted result.
132 BUILD_128BIT_CONST(mask_to_low, 01,03,05,07,09,0B,0D,0F, 80,80,80,80,80,80,80,80)
// Per-channel weights, replicated in all eight 16-bit lanes:
//   0x1D = 29, 0x96 = 150, 0x4D = 77  (29 + 150 + 77 = 256).
// NOTE(review): the usual luma weights (0.299 R, 0.587 G, 0.114 B) scale to about 77 for red
// and 29 for blue, i.e. the opposite of the values attached to val_red / val_blue here.
// Verify whether the red and blue weights (or labels) are swapped.
136 BUILD_128BIT_CONST(val_red , 00,1D, 00,1D, 00,1D, 00,1D, 00,1D, 00,1D, 00,1D, 00,1D)
137 BUILD_128BIT_CONST(val_green , 00,96, 00,96, 00,96, 00,96, 00,96, 00,96, 00,96, 00,96)
138 BUILD_128BIT_CONST(val_blue , 00,4D, 00,4D, 00,4D, 00,4D, 00,4D, 00,4D, 00,4D, 00,4D)
// m0/m1 extract the channel used as blue, m4/m5 the channel used as red (pixels 0..7).
// With IS_RGB the blue masks read byte offset 2 and the red masks offset 0; otherwise the
// two are exchanged. The middle channel (m2/m3) does not depend on IS_RGB.
140 const __m128i m0 = _mm_load_si128( IS_RGB ? (
const __m128i*)mask4 : (
const __m128i*)mask0);
141 const __m128i m1 = _mm_load_si128( IS_RGB ? (
const __m128i*)mask5 : (
const __m128i*)mask1);
142 const __m128i m2 = _mm_load_si128((
const __m128i*)mask2);
143 const __m128i m3 = _mm_load_si128((
const __m128i*)mask3);
144 const __m128i m4 = _mm_load_si128( IS_RGB ? (
const __m128i*)mask0 : (
const __m128i*)mask4);
145 const __m128i m5 = _mm_load_si128( IS_RGB ? (
const __m128i*)mask1 : (
const __m128i*)mask5);
// Same selection for pixels 8..15: m6/m7 blue, m8/m9 middle channel, m10/m11 red.
147 const __m128i m6 = _mm_load_si128( IS_RGB ? (
const __m128i*)mask10 : (
const __m128i*)mask6);
148 const __m128i m7 = _mm_load_si128( IS_RGB ? (
const __m128i*)mask11 : (
const __m128i*)mask7);
149 const __m128i m8 = _mm_load_si128((
const __m128i*)mask8);
150 const __m128i m9 = _mm_load_si128((
const __m128i*)mask9);
151 const __m128i m10= _mm_load_si128( IS_RGB ? (
const __m128i*)mask6 : (
const __m128i*)mask10);
152 const __m128i m11= _mm_load_si128( IS_RGB ? (
const __m128i*)mask7 : (
const __m128i*)mask11);
154 const __m128i mask_low= _mm_load_si128((
const __m128i*)mask_to_low);
156 const __m128i VAL_R = _mm_load_si128((
const __m128i*)val_red);
157 const __m128i VAL_G = _mm_load_si128((
const __m128i*)val_green);
158 const __m128i VAL_B = _mm_load_si128((
const __m128i*)val_blue);
// sw: number of 16-unit blocks per row (w / 16, remainder dropped). The definition of `sh`
// used by the outer loop is not visible in this excerpt.
160 const int sw = w >> 4;
163 for (
int i=0; i<sh; i++)
165 for (
int j=0; j<sw; j++)
// Each inner iteration reads 48 input bytes with aligned loads, so `in` must stay 16-byte
// aligned; 48 bytes correspond to 16 three-byte pixels.
168 const __m128i d0 = _mm_load_si128((
const __m128i*)in); in += 16;
169 const __m128i d1 = _mm_load_si128((
const __m128i*)in); in += 16;
170 const __m128i d2 = _mm_load_si128((
const __m128i*)in); in += 16;
// Pixels 0..7: split the interleaved bytes into one vector per channel. The two shuffles of
// each pair fill complementary lanes, so OR merges them.
174 const __m128i BLUES_0_7 = _mm_or_si128(_mm_shuffle_epi8(d0,m0),_mm_shuffle_epi8(d1,m1));
175 const __m128i GREENS_0_7 = _mm_or_si128(_mm_shuffle_epi8(d0,m2),_mm_shuffle_epi8(d1,m3));
176 const __m128i REDS_0_7 = _mm_or_si128(_mm_shuffle_epi8(d0,m4),_mm_shuffle_epi8(d1,m5));
// Weighted channels: _mm_mulhi_epu16 keeps the upper 16 bits of each unsigned 16x16 product.
// NOTE(review): the intrinsics that combine the three products are on lines missing from
// this excerpt (presumably additions).
184 const __m128i GRAYS_0_7 =
186 _mm_mulhi_epu16(REDS_0_7, VAL_R),
188 _mm_mulhi_epu16(GREENS_0_7, VAL_G),
189 _mm_mulhi_epu16(BLUES_0_7, VAL_B)
// Pack one byte per 16-bit lane and write the low 8 bytes (8 gray pixels) to `out`.
192 _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi8(GRAYS_0_7,mask_low));
// Pixels 8..15: same procedure on d1/d2. NOTE(review): any advance of `out` between the two
// stores is not visible in this excerpt.
198 const __m128i BLUES_8_15 = _mm_or_si128(_mm_shuffle_epi8(d1,m6),_mm_shuffle_epi8(d2,m7));
199 const __m128i GREENS_8_15 = _mm_or_si128(_mm_shuffle_epi8(d1,m8),_mm_shuffle_epi8(d2,m9));
200 const __m128i REDS_8_15 = _mm_or_si128(_mm_shuffle_epi8(d1,m10),_mm_shuffle_epi8(d2,m11));
202 const __m128i GRAYS_8_15 =
204 _mm_mulhi_epu16(REDS_8_15, VAL_R),
206 _mm_mulhi_epu16(GREENS_8_15, VAL_G),
207 _mm_mulhi_epu16(BLUES_8_15, VAL_B)
210 _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi8(GRAYS_8_15,mask_low));
// NOTE(review): the two calls below belong to two separate wrapper functions whose
// signatures are not visible in this excerpt; each forwards in/out/w/h unchanged.
// IS_RGB = false instantiation (presumably the BGR-ordered entry point).
229 private_image_SSSE3_rgb_or_bgr_to_gray_8u<false>(in,out,w,h);
// IS_RGB = true instantiation (presumably the RGB-ordered entry point).
242 private_image_SSSE3_rgb_or_bgr_to_gray_8u<true>(in,out,w,h);
248 #endif // end of MRPT_HAS_SSE3