sse_utils.hpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_CORE_SSE_UTILS_HPP__
#define __OPENCV_CORE_SSE_UTILS_HPP__

#ifndef __cplusplus
#  error sse_utils.hpp header must be compiled as C++
#endif

#include "opencv2/core/cvdef.h"

//! @addtogroup core_utils_sse
//! @{

#if CV_SSE2

inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_g1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_g1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk3);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk3);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk3);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk3);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk2);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk2);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk3);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk3);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk2);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk2);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk3);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk3);
}

inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                  __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g1);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g1);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b0);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b0);
    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_b1);
    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_b1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5);
    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5);
    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4);
    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5);
    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4);
    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5);
    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5);
}

inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                  __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b1);
    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_a0);
    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_a0);
    __m128i layer1_chunk6 = _mm_unpacklo_epi8(v_g1, v_a1);
    __m128i layer1_chunk7 = _mm_unpackhi_epi8(v_g1, v_a1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk6 = _mm_unpacklo_epi8(layer1_chunk3, layer1_chunk7);
    __m128i layer2_chunk7 = _mm_unpackhi_epi8(layer1_chunk3, layer1_chunk7);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk6 = _mm_unpacklo_epi8(layer2_chunk3, layer2_chunk7);
    __m128i layer3_chunk7 = _mm_unpackhi_epi8(layer2_chunk3, layer2_chunk7);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk4);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk4);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk5);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk5);
    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk6);
    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk6);
    __m128i layer4_chunk6 = _mm_unpacklo_epi8(layer3_chunk3, layer3_chunk7);
    __m128i layer4_chunk7 = _mm_unpackhi_epi8(layer3_chunk3, layer3_chunk7);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk4);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk4);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk5);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk5);
    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk6);
    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk6);
    v_a0 = _mm_unpacklo_epi8(layer4_chunk3, layer4_chunk7);
    v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7);
}

inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
}

inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
}

inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer4_chunk6 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
    __m128i layer4_chunk7 = _mm_packus_epi16(_mm_srli_epi16(v_a0, 8), _mm_srli_epi16(v_a1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
    __m128i layer3_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_and_si128(layer4_chunk6, v_mask), _mm_and_si128(layer4_chunk7, v_mask));
    __m128i layer3_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk6, 8), _mm_srli_epi16(layer4_chunk7, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
    __m128i layer2_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk6, 8), _mm_srli_epi16(layer3_chunk7, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
    __m128i layer1_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk6, 8), _mm_srli_epi16(layer2_chunk7, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_a0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
    v_g1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
    v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8));
}

inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_g1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk3);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk3);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk3);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk3);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk2);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk2);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk3);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk3);
}

inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                   __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b0);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b0);
    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_b1);
    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_b1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5);
    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5);
    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4);
    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5);
    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5);
}

inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                   __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b1);
    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_a0);
    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0);
    __m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1);
    __m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk6 = _mm_unpacklo_epi16(layer1_chunk3, layer1_chunk7);
    __m128i layer2_chunk7 = _mm_unpackhi_epi16(layer1_chunk3, layer1_chunk7);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk6 = _mm_unpacklo_epi16(layer2_chunk3, layer2_chunk7);
    __m128i layer3_chunk7 = _mm_unpackhi_epi16(layer2_chunk3, layer2_chunk7);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk4);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk4);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk5);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk5);
    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk6);
    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk6);
    v_a0 = _mm_unpacklo_epi16(layer3_chunk3, layer3_chunk7);
    v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7);
}

#if CV_SSE4_1

inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
}

inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                 __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
}

inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                 __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer3_chunk6 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
    __m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
    __m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
    __m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
    v_g1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
    v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
}

#endif // CV_SSE4_1

inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk3);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk2);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk2);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk3);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk3);
}

inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                                __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0);
    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1);
    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4);
    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5);
    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4);
    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5);
    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5);
}

inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                                __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b1);
    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_a0);
    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_a0);
    __m128 layer1_chunk6 = _mm_unpacklo_ps(v_g1, v_a1);
    __m128 layer1_chunk7 = _mm_unpackhi_ps(v_g1, v_a1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk4);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk4);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk5);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk5);
    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk6);
    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk6);
    __m128 layer2_chunk6 = _mm_unpacklo_ps(layer1_chunk3, layer1_chunk7);
    __m128 layer2_chunk7 = _mm_unpackhi_ps(layer1_chunk3, layer1_chunk7);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk4);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk4);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk5);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk5);
    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk6);
    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk6);
    v_a0 = _mm_unpacklo_ps(layer2_chunk3, layer2_chunk7);
    v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7);
}

inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_g0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_g1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
}

inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                              __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
    __m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
    v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
}

inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                              __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk5 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
    __m128 layer2_chunk6 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_a0, v_a1, mask_lo);
    __m128 layer2_chunk7 = _mm_shuffle_ps(v_a0, v_a1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
    __m128 layer1_chunk6 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_lo);
    __m128 layer1_chunk7 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_b0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_b1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
    v_a0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
    v_g1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_lo);
    v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
}

#endif // CV_SSE2

//! @}

#endif //__OPENCV_CORE_SSE_UTILS_HPP__
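The helpers above work in place on whole SSE registers: the deinterleave overloads split channel-interleaved data into per-channel planes, and the interleave overloads perform the reverse packing. As a quick orientation, here is a minimal, hypothetical round-trip sketch (the file name and buffer contents are illustrative and not part of OpenCV); it assumes an SSE2-capable target and that this header, together with its opencv2/core/cvdef.h dependency, is on the include path. Sixty-four bytes of interleaved R,G pairs are split into two planes with the two-channel _mm_deinterleave_epi8 overload and then recombined with _mm_interleave_epi8; the program prints two deinterleaved samples and whether re-interleaving restored the original byte order.

// round_trip_example.cpp -- hypothetical usage sketch, for illustration only
#include <cstdio>
#include <cstring>
#include "opencv2/core/sse_utils.hpp"

int main()
{
#if CV_SSE2
    unsigned char rg[64], r[32], g[32], back[64];
    for (int i = 0; i < 32; ++i)
    {
        rg[2 * i]     = (unsigned char)i;         // R sample
        rg[2 * i + 1] = (unsigned char)(i + 100); // G sample
    }

    // Load 64 interleaved bytes (R0 G0 R1 G1 ...) into four SSE registers.
    __m128i v_r0 = _mm_loadu_si128((const __m128i *)(rg));
    __m128i v_r1 = _mm_loadu_si128((const __m128i *)(rg + 16));
    __m128i v_g0 = _mm_loadu_si128((const __m128i *)(rg + 32));
    __m128i v_g1 = _mm_loadu_si128((const __m128i *)(rg + 48));

    // In-place deinterleave: v_r0/v_r1 now hold the 32 R bytes, v_g0/v_g1 the 32 G bytes.
    _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1);
    _mm_storeu_si128((__m128i *)(r),      v_r0);
    _mm_storeu_si128((__m128i *)(r + 16), v_r1);
    _mm_storeu_si128((__m128i *)(g),      v_g0);
    _mm_storeu_si128((__m128i *)(g + 16), v_g1);

    // Interleave the two planes again and store; "back" should match "rg".
    _mm_interleave_epi8(v_r0, v_r1, v_g0, v_g1);
    _mm_storeu_si128((__m128i *)(back),      v_r0);
    _mm_storeu_si128((__m128i *)(back + 16), v_r1);
    _mm_storeu_si128((__m128i *)(back + 32), v_g0);
    _mm_storeu_si128((__m128i *)(back + 48), v_g1);

    std::printf("r[5]=%u g[5]=%u round-trip %s\n", (unsigned)r[5], (unsigned)g[5],
                std::memcmp(rg, back, 64) == 0 ? "ok" : "mismatch");
#endif
    return 0;
}

This mirrors how OpenCV's own split/merge kernels use these overloads: consecutive loads of the interleaved source, one in-place call, then consecutive stores of the resulting planes (or vice versa for merging).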