opencv on mbed

Dependencies:   mbed

Committer:
joeverbout
Date:
Thu Mar 31 21:16:38 2016 +0000
Revision:
0:ea44dc9ed014
OpenCV on mbed attempt

Who changed what in which revision?

User | Revision | Line number | New contents of line
joeverbout 0:ea44dc9ed014 1 /*M///////////////////////////////////////////////////////////////////////////////////////
joeverbout 0:ea44dc9ed014 2 //
joeverbout 0:ea44dc9ed014 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
joeverbout 0:ea44dc9ed014 4 //
joeverbout 0:ea44dc9ed014 5 // By downloading, copying, installing or using the software you agree to this license.
joeverbout 0:ea44dc9ed014 6 // If you do not agree to this license, do not download, install,
joeverbout 0:ea44dc9ed014 7 // copy or use the software.
joeverbout 0:ea44dc9ed014 8 //
joeverbout 0:ea44dc9ed014 9 //
joeverbout 0:ea44dc9ed014 10 // License Agreement
joeverbout 0:ea44dc9ed014 11 // For Open Source Computer Vision Library
joeverbout 0:ea44dc9ed014 12 //
joeverbout 0:ea44dc9ed014 13 // Copyright (C) 2015, Itseez Inc., all rights reserved.
joeverbout 0:ea44dc9ed014 14 // Third party copyrights are property of their respective owners.
joeverbout 0:ea44dc9ed014 15 //
joeverbout 0:ea44dc9ed014 16 // Redistribution and use in source and binary forms, with or without modification,
joeverbout 0:ea44dc9ed014 17 // are permitted provided that the following conditions are met:
joeverbout 0:ea44dc9ed014 18 //
joeverbout 0:ea44dc9ed014 19 // * Redistribution's of source code must retain the above copyright notice,
joeverbout 0:ea44dc9ed014 20 // this list of conditions and the following disclaimer.
joeverbout 0:ea44dc9ed014 21 //
joeverbout 0:ea44dc9ed014 22 // * Redistribution's in binary form must reproduce the above copyright notice,
joeverbout 0:ea44dc9ed014 23 // this list of conditions and the following disclaimer in the documentation
joeverbout 0:ea44dc9ed014 24 // and/or other materials provided with the distribution.
joeverbout 0:ea44dc9ed014 25 //
joeverbout 0:ea44dc9ed014 26 // * The name of the copyright holders may not be used to endorse or promote products
joeverbout 0:ea44dc9ed014 27 // derived from this software without specific prior written permission.
joeverbout 0:ea44dc9ed014 28 //
joeverbout 0:ea44dc9ed014 29 // This software is provided by the copyright holders and contributors "as is" and
joeverbout 0:ea44dc9ed014 30 // any express or implied warranties, including, but not limited to, the implied
joeverbout 0:ea44dc9ed014 31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
joeverbout 0:ea44dc9ed014 32 // In no event shall the Intel Corporation or contributors be liable for any direct,
joeverbout 0:ea44dc9ed014 33 // indirect, incidental, special, exemplary, or consequential damages
joeverbout 0:ea44dc9ed014 34 // (including, but not limited to, procurement of substitute goods or services;
joeverbout 0:ea44dc9ed014 35 // loss of use, data, or profits; or business interruption) however caused
joeverbout 0:ea44dc9ed014 36 // and on any theory of liability, whether in contract, strict liability,
joeverbout 0:ea44dc9ed014 37 // or tort (including negligence or otherwise) arising in any way out of
joeverbout 0:ea44dc9ed014 38 // the use of this software, even if advised of the possibility of such damage.
joeverbout 0:ea44dc9ed014 39 //
joeverbout 0:ea44dc9ed014 40 //M*/
joeverbout 0:ea44dc9ed014 41
joeverbout 0:ea44dc9ed014 42 #ifndef __OPENCV_CORE_SSE_UTILS_HPP__
joeverbout 0:ea44dc9ed014 43 #define __OPENCV_CORE_SSE_UTILS_HPP__
joeverbout 0:ea44dc9ed014 44
joeverbout 0:ea44dc9ed014 45 #ifndef __cplusplus
joeverbout 0:ea44dc9ed014 46 # error sse_utils.hpp header must be compiled as C++
joeverbout 0:ea44dc9ed014 47 #endif
joeverbout 0:ea44dc9ed014 48
joeverbout 0:ea44dc9ed014 49 #include "opencv2/core/cvdef.h"
joeverbout 0:ea44dc9ed014 50
joeverbout 0:ea44dc9ed014 51 //! @addtogroup core_utils_sse
joeverbout 0:ea44dc9ed014 52 //! @{
joeverbout 0:ea44dc9ed014 53
joeverbout 0:ea44dc9ed014 54 #if CV_SSE2
joeverbout 0:ea44dc9ed014 55
/**
 * In-place byte shuffle across four 128-bit registers (64 bytes in total).
 *
 * Applies five rounds of _mm_unpacklo_epi8 / _mm_unpackhi_epi8. In each round,
 * register i of the current set is paired with register i + 2; the low-half and
 * high-half unpack results become registers 2i and 2i + 1 of the next round.
 * The first round takes the registers in the order v_r0, v_r1, v_g0, v_g1 and
 * the last round writes its results back in that same order.
 *
 * NOTE(review): judging by the function name and the r/g parameter names, the
 * inputs presumably hold 64 interleaved two-channel bytes in load order and the
 * outputs hold the two channels separated -- confirm against the callers.
 *
 * @param v_r0,v_r1,v_g0,v_g1  Read as input and overwritten with the result.
 */
joeverbout 0:ea44dc9ed014 56 inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
joeverbout 0:ea44dc9ed014 57 {
// Round 1: pair (v_r0, v_g0) and (v_r1, v_g1).
joeverbout 0:ea44dc9ed014 58 __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g0);
joeverbout 0:ea44dc9ed014 59 __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g0);
joeverbout 0:ea44dc9ed014 60 __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_g1);
joeverbout 0:ea44dc9ed014 61 __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_g1);
joeverbout 0:ea44dc9ed014 62
// Rounds 2-4: same pairing pattern (chunk i with chunk i + 2) on the previous layer.
joeverbout 0:ea44dc9ed014 63 __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk2);
joeverbout 0:ea44dc9ed014 64 __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk2);
joeverbout 0:ea44dc9ed014 65 __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk3);
joeverbout 0:ea44dc9ed014 66 __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk3);
joeverbout 0:ea44dc9ed014 67
joeverbout 0:ea44dc9ed014 68 __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk2);
joeverbout 0:ea44dc9ed014 69 __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk2);
joeverbout 0:ea44dc9ed014 70 __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk3);
joeverbout 0:ea44dc9ed014 71 __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk3);
joeverbout 0:ea44dc9ed014 72
joeverbout 0:ea44dc9ed014 73 __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk2);
joeverbout 0:ea44dc9ed014 74 __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk2);
joeverbout 0:ea44dc9ed014 75 __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk3);
joeverbout 0:ea44dc9ed014 76 __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk3);
joeverbout 0:ea44dc9ed014 77
// Round 5: results are stored straight back into the caller's registers.
joeverbout 0:ea44dc9ed014 78 v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk2);
joeverbout 0:ea44dc9ed014 79 v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk2);
joeverbout 0:ea44dc9ed014 80 v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk3);
joeverbout 0:ea44dc9ed014 81 v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk3);
joeverbout 0:ea44dc9ed014 82 }
joeverbout 0:ea44dc9ed014 83
/**
 * In-place byte shuffle across six 128-bit registers (96 bytes in total).
 *
 * Applies five rounds of _mm_unpacklo_epi8 / _mm_unpackhi_epi8. In each round,
 * register i of the current set is paired with register i + 3; the low-half and
 * high-half unpack results become registers 2i and 2i + 1 of the next round.
 * The first round takes the registers in the order v_r0, v_r1, v_g0, v_g1,
 * v_b0, v_b1 and the last round writes its results back in that same order.
 *
 * NOTE(review): judging by the function name and the r/g/b parameter names, the
 * inputs presumably hold 96 interleaved three-channel bytes in load order and
 * the outputs hold the three channels separated -- confirm against the callers.
 *
 * @param v_r0,v_r1,v_g0,v_g1,v_b0,v_b1  Read as input and overwritten with the result.
 */
joeverbout 0:ea44dc9ed014 84 inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
joeverbout 0:ea44dc9ed014 85 __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
joeverbout 0:ea44dc9ed014 86 {
// Round 1: pair (v_r0, v_g1), (v_r1, v_b0) and (v_g0, v_b1), i.e. register i with register i + 3.
joeverbout 0:ea44dc9ed014 87 __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g1);
joeverbout 0:ea44dc9ed014 88 __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g1);
joeverbout 0:ea44dc9ed014 89 __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b0);
joeverbout 0:ea44dc9ed014 90 __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b0);
joeverbout 0:ea44dc9ed014 91 __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_b1);
joeverbout 0:ea44dc9ed014 92 __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_b1);
joeverbout 0:ea44dc9ed014 93
// Rounds 2-4: same pairing pattern (chunk i with chunk i + 3) on the previous layer.
joeverbout 0:ea44dc9ed014 94 __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3);
joeverbout 0:ea44dc9ed014 95 __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3);
joeverbout 0:ea44dc9ed014 96 __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4);
joeverbout 0:ea44dc9ed014 97 __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4);
joeverbout 0:ea44dc9ed014 98 __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5);
joeverbout 0:ea44dc9ed014 99 __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5);
joeverbout 0:ea44dc9ed014 100
joeverbout 0:ea44dc9ed014 101 __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3);
joeverbout 0:ea44dc9ed014 102 __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3);
joeverbout 0:ea44dc9ed014 103 __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4);
joeverbout 0:ea44dc9ed014 104 __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4);
joeverbout 0:ea44dc9ed014 105 __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5);
joeverbout 0:ea44dc9ed014 106 __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5);
joeverbout 0:ea44dc9ed014 107
joeverbout 0:ea44dc9ed014 108 __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3);
joeverbout 0:ea44dc9ed014 109 __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3);
joeverbout 0:ea44dc9ed014 110 __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4);
joeverbout 0:ea44dc9ed014 111 __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4);
joeverbout 0:ea44dc9ed014 112 __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5);
joeverbout 0:ea44dc9ed014 113 __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5);
joeverbout 0:ea44dc9ed014 114
// Round 5: results are stored straight back into the caller's registers.
joeverbout 0:ea44dc9ed014 115 v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3);
joeverbout 0:ea44dc9ed014 116 v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3);
joeverbout 0:ea44dc9ed014 117 v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4);
joeverbout 0:ea44dc9ed014 118 v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4);
joeverbout 0:ea44dc9ed014 119 v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5);
joeverbout 0:ea44dc9ed014 120 v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5);
joeverbout 0:ea44dc9ed014 121 }
joeverbout 0:ea44dc9ed014 122
/**
 * In-place byte shuffle across eight 128-bit registers (128 bytes in total).
 *
 * Applies five rounds of _mm_unpacklo_epi8 / _mm_unpackhi_epi8. In each round,
 * register i of the current set is paired with register i + 4; the low-half and
 * high-half unpack results become registers 2i and 2i + 1 of the next round.
 * The first round takes the registers in the order v_r0, v_r1, v_g0, v_g1,
 * v_b0, v_b1, v_a0, v_a1 and the last round writes its results back in that
 * same order.
 *
 * NOTE(review): judging by the function name and the r/g/b/a parameter names,
 * the inputs presumably hold 128 interleaved four-channel bytes in load order
 * and the outputs hold the four channels separated -- confirm against the callers.
 *
 * @param v_r0,v_r1,v_g0,v_g1,v_b0,v_b1,v_a0,v_a1  Read as input and overwritten with the result.
 */
joeverbout 0:ea44dc9ed014 123 inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
joeverbout 0:ea44dc9ed014 124 __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
joeverbout 0:ea44dc9ed014 125 {
// Round 1: pair (v_r0, v_b0), (v_r1, v_b1), (v_g0, v_a0) and (v_g1, v_a1), i.e. register i with register i + 4.
joeverbout 0:ea44dc9ed014 126 __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0);
joeverbout 0:ea44dc9ed014 127 __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0);
joeverbout 0:ea44dc9ed014 128 __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b1);
joeverbout 0:ea44dc9ed014 129 __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b1);
joeverbout 0:ea44dc9ed014 130 __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_a0);
joeverbout 0:ea44dc9ed014 131 __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_a0);
joeverbout 0:ea44dc9ed014 132 __m128i layer1_chunk6 = _mm_unpacklo_epi8(v_g1, v_a1);
joeverbout 0:ea44dc9ed014 133 __m128i layer1_chunk7 = _mm_unpackhi_epi8(v_g1, v_a1);
joeverbout 0:ea44dc9ed014 134
// Rounds 2-4: same pairing pattern (chunk i with chunk i + 4) on the previous layer.
joeverbout 0:ea44dc9ed014 135 __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk4);
joeverbout 0:ea44dc9ed014 136 __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk4);
joeverbout 0:ea44dc9ed014 137 __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk5);
joeverbout 0:ea44dc9ed014 138 __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk5);
joeverbout 0:ea44dc9ed014 139 __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk6);
joeverbout 0:ea44dc9ed014 140 __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk6);
joeverbout 0:ea44dc9ed014 141 __m128i layer2_chunk6 = _mm_unpacklo_epi8(layer1_chunk3, layer1_chunk7);
joeverbout 0:ea44dc9ed014 142 __m128i layer2_chunk7 = _mm_unpackhi_epi8(layer1_chunk3, layer1_chunk7);
joeverbout 0:ea44dc9ed014 143
joeverbout 0:ea44dc9ed014 144 __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk4);
joeverbout 0:ea44dc9ed014 145 __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk4);
joeverbout 0:ea44dc9ed014 146 __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk5);
joeverbout 0:ea44dc9ed014 147 __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk5);
joeverbout 0:ea44dc9ed014 148 __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk6);
joeverbout 0:ea44dc9ed014 149 __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk6);
joeverbout 0:ea44dc9ed014 150 __m128i layer3_chunk6 = _mm_unpacklo_epi8(layer2_chunk3, layer2_chunk7);
joeverbout 0:ea44dc9ed014 151 __m128i layer3_chunk7 = _mm_unpackhi_epi8(layer2_chunk3, layer2_chunk7);
joeverbout 0:ea44dc9ed014 152
joeverbout 0:ea44dc9ed014 153 __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk4);
joeverbout 0:ea44dc9ed014 154 __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk4);
joeverbout 0:ea44dc9ed014 155 __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk5);
joeverbout 0:ea44dc9ed014 156 __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk5);
joeverbout 0:ea44dc9ed014 157 __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk6);
joeverbout 0:ea44dc9ed014 158 __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk6);
joeverbout 0:ea44dc9ed014 159 __m128i layer4_chunk6 = _mm_unpacklo_epi8(layer3_chunk3, layer3_chunk7);
joeverbout 0:ea44dc9ed014 160 __m128i layer4_chunk7 = _mm_unpackhi_epi8(layer3_chunk3, layer3_chunk7);
joeverbout 0:ea44dc9ed014 161
// Round 5: results are stored straight back into the caller's registers.
joeverbout 0:ea44dc9ed014 162 v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk4);
joeverbout 0:ea44dc9ed014 163 v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk4);
joeverbout 0:ea44dc9ed014 164 v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk5);
joeverbout 0:ea44dc9ed014 165 v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk5);
joeverbout 0:ea44dc9ed014 166 v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk6);
joeverbout 0:ea44dc9ed014 167 v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk6);
joeverbout 0:ea44dc9ed014 168 v_a0 = _mm_unpacklo_epi8(layer4_chunk3, layer4_chunk7);
joeverbout 0:ea44dc9ed014 169 v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7);
joeverbout 0:ea44dc9ed014 170 }
joeverbout 0:ea44dc9ed014 171
/**
 * In-place byte shuffle across four 128-bit registers (64 bytes in total),
 * built from mask/shift + _mm_packus_epi16 steps.
 *
 * Applies five rounds. In each round, consecutive registers 2i and 2i + 1 of the
 * current set (first round: v_r0, v_r1, v_g0, v_g1) are combined twice:
 *  - the low byte of every 16-bit lane of both registers is packed into
 *    register i of the next round;
 *  - the high byte of every 16-bit lane of both registers is packed into
 *    register i + 2 of the next round.
 * The last round writes its registers back in the order v_r0, v_r1, v_g0, v_g1.
 *
 * NOTE(review): judging by the function name and the r/g parameter names, the
 * inputs presumably hold two separated channels and the outputs hold 64
 * interleaved two-channel bytes in store order -- confirm against the callers.
 *
 * @param v_r0,v_r1,v_g0,v_g1  Read as input and overwritten with the result.
 */
joeverbout 0:ea44dc9ed014 172 inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
joeverbout 0:ea44dc9ed014 173 {
// 0x00ff in every 16-bit lane: keeps only the low byte of each lane.
joeverbout 0:ea44dc9ed014 174 __m128i v_mask = _mm_set1_epi16(0x00ff);
joeverbout 0:ea44dc9ed014 175
// Every value handed to _mm_packus_epi16 below was either masked with 0x00ff or
// shifted right by 8, so it lies in 0..255 and the pack's saturation never alters it.
joeverbout 0:ea44dc9ed014 176 __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
joeverbout 0:ea44dc9ed014 177 __m128i layer4_chunk2 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
joeverbout 0:ea44dc9ed014 178 __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
joeverbout 0:ea44dc9ed014 179 __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
joeverbout 0:ea44dc9ed014 180
joeverbout 0:ea44dc9ed014 181 __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
joeverbout 0:ea44dc9ed014 182 __m128i layer3_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
joeverbout 0:ea44dc9ed014 183 __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
joeverbout 0:ea44dc9ed014 184 __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
joeverbout 0:ea44dc9ed014 185
joeverbout 0:ea44dc9ed014 186 __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
joeverbout 0:ea44dc9ed014 187 __m128i layer2_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
joeverbout 0:ea44dc9ed014 188 __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
joeverbout 0:ea44dc9ed014 189 __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
joeverbout 0:ea44dc9ed014 190
joeverbout 0:ea44dc9ed014 191 __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
joeverbout 0:ea44dc9ed014 192 __m128i layer1_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
joeverbout 0:ea44dc9ed014 193 __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
joeverbout 0:ea44dc9ed014 194 __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
joeverbout 0:ea44dc9ed014 195
// Final round: statement order differs from register order (v_r0, v_g0, v_r1, v_g1)
// because each low-byte/high-byte pair feeds registers i and i + 2.
joeverbout 0:ea44dc9ed014 196 v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
joeverbout 0:ea44dc9ed014 197 v_g0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
joeverbout 0:ea44dc9ed014 198 v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
joeverbout 0:ea44dc9ed014 199 v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
joeverbout 0:ea44dc9ed014 200 }
joeverbout 0:ea44dc9ed014 201
/**
 * In-place byte shuffle across six 128-bit registers (96 bytes in total),
 * built from mask/shift + _mm_packus_epi16 steps.
 *
 * Applies five rounds. In each round, consecutive registers 2i and 2i + 1 of the
 * current set (first round: v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) are combined twice:
 *  - the low byte of every 16-bit lane of both registers is packed into
 *    register i of the next round;
 *  - the high byte of every 16-bit lane of both registers is packed into
 *    register i + 3 of the next round.
 * The last round writes its registers back in the order v_r0, v_r1, v_g0, v_g1,
 * v_b0, v_b1.
 *
 * NOTE(review): judging by the function name and the r/g/b parameter names, the
 * inputs presumably hold three separated channels and the outputs hold 96
 * interleaved three-channel bytes in store order -- confirm against the callers.
 *
 * @param v_r0,v_r1,v_g0,v_g1,v_b0,v_b1  Read as input and overwritten with the result.
 */
joeverbout 0:ea44dc9ed014 202 inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
joeverbout 0:ea44dc9ed014 203 __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
joeverbout 0:ea44dc9ed014 204 {
// 0x00ff in every 16-bit lane: keeps only the low byte of each lane.
joeverbout 0:ea44dc9ed014 205 __m128i v_mask = _mm_set1_epi16(0x00ff);
joeverbout 0:ea44dc9ed014 206
// Every value handed to _mm_packus_epi16 below was either masked with 0x00ff or
// shifted right by 8, so it lies in 0..255 and the pack's saturation never alters it.
joeverbout 0:ea44dc9ed014 207 __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
joeverbout 0:ea44dc9ed014 208 __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
joeverbout 0:ea44dc9ed014 209 __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
joeverbout 0:ea44dc9ed014 210 __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
joeverbout 0:ea44dc9ed014 211 __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
joeverbout 0:ea44dc9ed014 212 __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
joeverbout 0:ea44dc9ed014 213
joeverbout 0:ea44dc9ed014 214 __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
joeverbout 0:ea44dc9ed014 215 __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
joeverbout 0:ea44dc9ed014 216 __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
joeverbout 0:ea44dc9ed014 217 __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
joeverbout 0:ea44dc9ed014 218 __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
joeverbout 0:ea44dc9ed014 219 __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
joeverbout 0:ea44dc9ed014 220
joeverbout 0:ea44dc9ed014 221 __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
joeverbout 0:ea44dc9ed014 222 __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
joeverbout 0:ea44dc9ed014 223 __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
joeverbout 0:ea44dc9ed014 224 __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
joeverbout 0:ea44dc9ed014 225 __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
joeverbout 0:ea44dc9ed014 226 __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
joeverbout 0:ea44dc9ed014 227
joeverbout 0:ea44dc9ed014 228 __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
joeverbout 0:ea44dc9ed014 229 __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
joeverbout 0:ea44dc9ed014 230 __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
joeverbout 0:ea44dc9ed014 231 __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
joeverbout 0:ea44dc9ed014 232 __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
joeverbout 0:ea44dc9ed014 233 __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
joeverbout 0:ea44dc9ed014 234
// Final round: statement order differs from register order because each
// low-byte/high-byte pair feeds registers i and i + 3 (v_r0/v_g1, v_r1/v_b0, v_g0/v_b1).
joeverbout 0:ea44dc9ed014 235 v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
joeverbout 0:ea44dc9ed014 236 v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
joeverbout 0:ea44dc9ed014 237 v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
joeverbout 0:ea44dc9ed014 238 v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
joeverbout 0:ea44dc9ed014 239 v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
joeverbout 0:ea44dc9ed014 240 v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
joeverbout 0:ea44dc9ed014 241 }
joeverbout 0:ea44dc9ed014 242
/**
 * In-place byte shuffle across eight 128-bit registers (128 bytes in total),
 * built from mask/shift + _mm_packus_epi16 steps.
 *
 * Applies five rounds. In each round, consecutive registers 2i and 2i + 1 of the
 * current set (first round: v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1) are
 * combined twice:
 *  - the low byte of every 16-bit lane of both registers is packed into
 *    register i of the next round;
 *  - the high byte of every 16-bit lane of both registers is packed into
 *    register i + 4 of the next round.
 * The last round writes its registers back in the order v_r0, v_r1, v_g0, v_g1,
 * v_b0, v_b1, v_a0, v_a1.
 *
 * NOTE(review): judging by the function name and the r/g/b/a parameter names,
 * the inputs presumably hold four separated channels and the outputs hold 128
 * interleaved four-channel bytes in store order -- confirm against the callers.
 *
 * @param v_r0,v_r1,v_g0,v_g1,v_b0,v_b1,v_a0,v_a1  Read as input and overwritten with the result.
 */
joeverbout 0:ea44dc9ed014 243 inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
joeverbout 0:ea44dc9ed014 244 __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
joeverbout 0:ea44dc9ed014 245 {
// 0x00ff in every 16-bit lane: keeps only the low byte of each lane.
joeverbout 0:ea44dc9ed014 246 __m128i v_mask = _mm_set1_epi16(0x00ff);
joeverbout 0:ea44dc9ed014 247
// Every value handed to _mm_packus_epi16 below was either masked with 0x00ff or
// shifted right by 8, so it lies in 0..255 and the pack's saturation never alters it.
joeverbout 0:ea44dc9ed014 248 __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
joeverbout 0:ea44dc9ed014 249 __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
joeverbout 0:ea44dc9ed014 250 __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
joeverbout 0:ea44dc9ed014 251 __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
joeverbout 0:ea44dc9ed014 252 __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
joeverbout 0:ea44dc9ed014 253 __m128i layer4_chunk6 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
joeverbout 0:ea44dc9ed014 254 __m128i layer4_chunk3 = _mm_packus_epi16(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
joeverbout 0:ea44dc9ed014 255 __m128i layer4_chunk7 = _mm_packus_epi16(_mm_srli_epi16(v_a0, 8), _mm_srli_epi16(v_a1, 8));
joeverbout 0:ea44dc9ed014 256
joeverbout 0:ea44dc9ed014 257 __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
joeverbout 0:ea44dc9ed014 258 __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
joeverbout 0:ea44dc9ed014 259 __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
joeverbout 0:ea44dc9ed014 260 __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
joeverbout 0:ea44dc9ed014 261 __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
joeverbout 0:ea44dc9ed014 262 __m128i layer3_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
joeverbout 0:ea44dc9ed014 263 __m128i layer3_chunk3 = _mm_packus_epi16(_mm_and_si128(layer4_chunk6, v_mask), _mm_and_si128(layer4_chunk7, v_mask));
joeverbout 0:ea44dc9ed014 264 __m128i layer3_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk6, 8), _mm_srli_epi16(layer4_chunk7, 8));
joeverbout 0:ea44dc9ed014 265
joeverbout 0:ea44dc9ed014 266 __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
joeverbout 0:ea44dc9ed014 267 __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
joeverbout 0:ea44dc9ed014 268 __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
joeverbout 0:ea44dc9ed014 269 __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
joeverbout 0:ea44dc9ed014 270 __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
joeverbout 0:ea44dc9ed014 271 __m128i layer2_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
joeverbout 0:ea44dc9ed014 272 __m128i layer2_chunk3 = _mm_packus_epi16(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
joeverbout 0:ea44dc9ed014 273 __m128i layer2_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk6, 8), _mm_srli_epi16(layer3_chunk7, 8));
joeverbout 0:ea44dc9ed014 274
joeverbout 0:ea44dc9ed014 275 __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
joeverbout 0:ea44dc9ed014 276 __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
joeverbout 0:ea44dc9ed014 277 __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
joeverbout 0:ea44dc9ed014 278 __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
joeverbout 0:ea44dc9ed014 279 __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
joeverbout 0:ea44dc9ed014 280 __m128i layer1_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
joeverbout 0:ea44dc9ed014 281 __m128i layer1_chunk3 = _mm_packus_epi16(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
joeverbout 0:ea44dc9ed014 282 __m128i layer1_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk6, 8), _mm_srli_epi16(layer2_chunk7, 8));
joeverbout 0:ea44dc9ed014 283
// Final round: statement order differs from register order because each
// low-byte/high-byte pair feeds registers i and i + 4 (v_r0/v_b0, v_r1/v_b1, v_g0/v_a0, v_g1/v_a1).
joeverbout 0:ea44dc9ed014 284 v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
joeverbout 0:ea44dc9ed014 285 v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
joeverbout 0:ea44dc9ed014 286 v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
joeverbout 0:ea44dc9ed014 287 v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
joeverbout 0:ea44dc9ed014 288 v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
joeverbout 0:ea44dc9ed014 289 v_a0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
joeverbout 0:ea44dc9ed014 290 v_g1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
joeverbout 0:ea44dc9ed014 291 v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8));
joeverbout 0:ea44dc9ed014 292 }
joeverbout 0:ea44dc9ed014 293
/**
 * In-place shuffle of 16-bit elements across four 128-bit registers
 * (32 elements in total).
 *
 * Applies four rounds of _mm_unpacklo_epi16 / _mm_unpackhi_epi16. In each round,
 * register i of the current set is paired with register i + 2; the low-half and
 * high-half unpack results become registers 2i and 2i + 1 of the next round.
 * The first round takes the registers in the order v_r0, v_r1, v_g0, v_g1 and
 * the last round writes its results back in that same order.
 *
 * NOTE(review): judging by the function name and the r/g parameter names, the
 * inputs presumably hold 32 interleaved two-channel 16-bit values in load order
 * and the outputs hold the two channels separated -- confirm against the callers.
 *
 * @param v_r0,v_r1,v_g0,v_g1  Read as input and overwritten with the result.
 */
joeverbout 0:ea44dc9ed014 294 inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
joeverbout 0:ea44dc9ed014 295 {
// Round 1: pair (v_r0, v_g0) and (v_r1, v_g1).
joeverbout 0:ea44dc9ed014 296 __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0);
joeverbout 0:ea44dc9ed014 297 __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0);
joeverbout 0:ea44dc9ed014 298 __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1);
joeverbout 0:ea44dc9ed014 299 __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_g1);
joeverbout 0:ea44dc9ed014 300
// Rounds 2-3: same pairing pattern (chunk i with chunk i + 2) on the previous layer.
joeverbout 0:ea44dc9ed014 301 __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk2);
joeverbout 0:ea44dc9ed014 302 __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk2);
joeverbout 0:ea44dc9ed014 303 __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk3);
joeverbout 0:ea44dc9ed014 304 __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk3);
joeverbout 0:ea44dc9ed014 305
joeverbout 0:ea44dc9ed014 306 __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk2);
joeverbout 0:ea44dc9ed014 307 __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk2);
joeverbout 0:ea44dc9ed014 308 __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk3);
joeverbout 0:ea44dc9ed014 309 __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk3);
joeverbout 0:ea44dc9ed014 310
// Round 4: results are stored straight back into the caller's registers.
joeverbout 0:ea44dc9ed014 311 v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk2);
joeverbout 0:ea44dc9ed014 312 v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk2);
joeverbout 0:ea44dc9ed014 313 v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk3);
joeverbout 0:ea44dc9ed014 314 v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk3);
joeverbout 0:ea44dc9ed014 315 }
joeverbout 0:ea44dc9ed014 316
/**
 * In-place shuffle of 16-bit elements across six 128-bit registers
 * (48 elements in total).
 *
 * Applies four rounds of _mm_unpacklo_epi16 / _mm_unpackhi_epi16. In each round,
 * register i of the current set is paired with register i + 3; the low-half and
 * high-half unpack results become registers 2i and 2i + 1 of the next round.
 * The first round takes the registers in the order v_r0, v_r1, v_g0, v_g1,
 * v_b0, v_b1 and the last round writes its results back in that same order.
 *
 * NOTE(review): judging by the function name and the r/g/b parameter names, the
 * inputs presumably hold 48 interleaved three-channel 16-bit values in load
 * order and the outputs hold the three channels separated -- confirm against
 * the callers.
 *
 * @param v_r0,v_r1,v_g0,v_g1,v_b0,v_b1  Read as input and overwritten with the result.
 */
joeverbout 0:ea44dc9ed014 317 inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
joeverbout 0:ea44dc9ed014 318 __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
joeverbout 0:ea44dc9ed014 319 {
// Round 1: pair (v_r0, v_g1), (v_r1, v_b0) and (v_g0, v_b1), i.e. register i with register i + 3.
joeverbout 0:ea44dc9ed014 320 __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1);
joeverbout 0:ea44dc9ed014 321 __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1);
joeverbout 0:ea44dc9ed014 322 __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b0);
joeverbout 0:ea44dc9ed014 323 __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b0);
joeverbout 0:ea44dc9ed014 324 __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_b1);
joeverbout 0:ea44dc9ed014 325 __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_b1);
joeverbout 0:ea44dc9ed014 326
// Rounds 2-3: same pairing pattern (chunk i with chunk i + 3) on the previous layer.
joeverbout 0:ea44dc9ed014 327 __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3);
joeverbout 0:ea44dc9ed014 328 __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3);
joeverbout 0:ea44dc9ed014 329 __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4);
joeverbout 0:ea44dc9ed014 330 __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4);
joeverbout 0:ea44dc9ed014 331 __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5);
joeverbout 0:ea44dc9ed014 332 __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5);
joeverbout 0:ea44dc9ed014 333
joeverbout 0:ea44dc9ed014 334 __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3);
joeverbout 0:ea44dc9ed014 335 __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3);
joeverbout 0:ea44dc9ed014 336 __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4);
joeverbout 0:ea44dc9ed014 337 __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4);
joeverbout 0:ea44dc9ed014 338 __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5);
joeverbout 0:ea44dc9ed014 339 __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5);
joeverbout 0:ea44dc9ed014 340
// Round 4: results are stored straight back into the caller's registers.
joeverbout 0:ea44dc9ed014 341 v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3);
joeverbout 0:ea44dc9ed014 342 v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3);
joeverbout 0:ea44dc9ed014 343 v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4);
joeverbout 0:ea44dc9ed014 344 v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4);
joeverbout 0:ea44dc9ed014 345 v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5);
joeverbout 0:ea44dc9ed014 346 v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5);
joeverbout 0:ea44dc9ed014 347 }
joeverbout 0:ea44dc9ed014 348
// Splits four-way interleaved 16-bit data into four per-channel register pairs, in place.
//
// Read in parameter order (v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1), the
// eight registers are treated as one run of 64 16-bit elements. On return,
// element i of that run sits in the pair selected by i % 4
// (0 -> v_r0:v_r1, 1 -> v_g0:v_g1, 2 -> v_b0:v_b1, 3 -> v_a0:v_a1),
// at position i / 4 within that pair. All eight arguments are overwritten.
//
// The work is done in four passes. Each pass pairs slot k with slot k + 4 and
// writes their low/high 16-bit unpack to slots 2k and 2k + 1.
joeverbout 0:ea44dc9ed014 349 inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
joeverbout 0:ea44dc9ed014 350 __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
joeverbout 0:ea44dc9ed014 351 {
// Pass 1: inputs are paired across the middle of the parameter list (r with b, g with a).
joeverbout 0:ea44dc9ed014 352 __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0);
joeverbout 0:ea44dc9ed014 353 __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0);
joeverbout 0:ea44dc9ed014 354 __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b1);
joeverbout 0:ea44dc9ed014 355 __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b1);
joeverbout 0:ea44dc9ed014 356 __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_a0);
joeverbout 0:ea44dc9ed014 357 __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0);
joeverbout 0:ea44dc9ed014 358 __m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1);
joeverbout 0:ea44dc9ed014 359 __m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1);
joeverbout 0:ea44dc9ed014 360
// Pass 2: chunk k is paired with chunk k + 4.
joeverbout 0:ea44dc9ed014 361 __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4);
joeverbout 0:ea44dc9ed014 362 __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4);
joeverbout 0:ea44dc9ed014 363 __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5);
joeverbout 0:ea44dc9ed014 364 __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk5);
joeverbout 0:ea44dc9ed014 365 __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk6);
joeverbout 0:ea44dc9ed014 366 __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk6);
joeverbout 0:ea44dc9ed014 367 __m128i layer2_chunk6 = _mm_unpacklo_epi16(layer1_chunk3, layer1_chunk7);
joeverbout 0:ea44dc9ed014 368 __m128i layer2_chunk7 = _mm_unpackhi_epi16(layer1_chunk3, layer1_chunk7);
joeverbout 0:ea44dc9ed014 369
// Pass 3: same pairing rule applied to the pass-2 results.
joeverbout 0:ea44dc9ed014 370 __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk4);
joeverbout 0:ea44dc9ed014 371 __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk4);
joeverbout 0:ea44dc9ed014 372 __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk5);
joeverbout 0:ea44dc9ed014 373 __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk5);
joeverbout 0:ea44dc9ed014 374 __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk6);
joeverbout 0:ea44dc9ed014 375 __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk6);
joeverbout 0:ea44dc9ed014 376 __m128i layer3_chunk6 = _mm_unpacklo_epi16(layer2_chunk3, layer2_chunk7);
joeverbout 0:ea44dc9ed014 377 __m128i layer3_chunk7 = _mm_unpackhi_epi16(layer2_chunk3, layer2_chunk7);
joeverbout 0:ea44dc9ed014 378
// Pass 4: results are written back to the arguments in parameter order.
joeverbout 0:ea44dc9ed014 379 v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk4);
joeverbout 0:ea44dc9ed014 380 v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk4);
joeverbout 0:ea44dc9ed014 381 v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk5);
joeverbout 0:ea44dc9ed014 382 v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk5);
joeverbout 0:ea44dc9ed014 383 v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk6);
joeverbout 0:ea44dc9ed014 384 v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk6);
joeverbout 0:ea44dc9ed014 385 v_a0 = _mm_unpacklo_epi16(layer3_chunk3, layer3_chunk7);
joeverbout 0:ea44dc9ed014 386 v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7);
joeverbout 0:ea44dc9ed014 387 }
joeverbout 0:ea44dc9ed014 388
joeverbout 0:ea44dc9ed014 389 #if CV_SSE4_1
joeverbout 0:ea44dc9ed014 390
// Merges two planar 16-bit channels into two-way interleaved order, in place.
//
// On entry v_r0:v_r1 and v_g0:v_g1 each hold 16 16-bit elements of one channel.
// On return the four registers, read in parameter order (v_r0, v_r1, v_g0, v_g1),
// form one run of 32 elements in which position 2 * j holds old element j of
// v_r0:v_r1 and position 2 * j + 1 holds old element j of v_g0:v_g1.
//
// The work is done in four passes. Each pass takes the adjacent slot pair
// (2k, 2k + 1) and writes its even-indexed 16-bit elements to slot k and its
// odd-indexed ones to slot k + 2.
joeverbout 0:ea44dc9ed014 391 inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
joeverbout 0:ea44dc9ed014 392 {
// Keeps the low 16 bits of every 32-bit lane, i.e. the even-indexed 16-bit elements.
joeverbout 0:ea44dc9ed014 393 __m128i v_mask = _mm_set1_epi32(0x0000ffff);
joeverbout 0:ea44dc9ed014 394
// Both the masked and the shifted operands lie in 0..0xffff, so the
// unsigned-saturating pack copies the values through unchanged.
joeverbout 0:ea44dc9ed014 395 __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
joeverbout 0:ea44dc9ed014 396 __m128i layer3_chunk2 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
joeverbout 0:ea44dc9ed014 397 __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
joeverbout 0:ea44dc9ed014 398 __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
joeverbout 0:ea44dc9ed014 399
joeverbout 0:ea44dc9ed014 400 __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
joeverbout 0:ea44dc9ed014 401 __m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
joeverbout 0:ea44dc9ed014 402 __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
joeverbout 0:ea44dc9ed014 403 __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
joeverbout 0:ea44dc9ed014 404
joeverbout 0:ea44dc9ed014 405 __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
joeverbout 0:ea44dc9ed014 406 __m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
joeverbout 0:ea44dc9ed014 407 __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
joeverbout 0:ea44dc9ed014 408 __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
joeverbout 0:ea44dc9ed014 409
// Final pass: slots 0..3 are v_r0, v_r1, v_g0, v_g1, hence the r0/g0/r1/g1 write order.
joeverbout 0:ea44dc9ed014 410 v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
joeverbout 0:ea44dc9ed014 411 v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
joeverbout 0:ea44dc9ed014 412 v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
joeverbout 0:ea44dc9ed014 413 v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
joeverbout 0:ea44dc9ed014 414 }
joeverbout 0:ea44dc9ed014 415
// Merges three planar 16-bit channels into three-way interleaved order, in place.
//
// On entry v_r0:v_r1, v_g0:v_g1 and v_b0:v_b1 each hold 16 16-bit elements of
// one channel (call them channels 0, 1 and 2). On return the six registers,
// read in parameter order (v_r0, v_r1, v_g0, v_g1, v_b0, v_b1), form one run
// of 48 elements in which position 3 * j + c holds old element j of channel c.
//
// The work is done in four passes. Each pass takes the adjacent slot pair
// (2k, 2k + 1) and writes its even-indexed 16-bit elements to slot k and its
// odd-indexed ones to slot k + 3.
joeverbout 0:ea44dc9ed014 416 inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
joeverbout 0:ea44dc9ed014 417 __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
joeverbout 0:ea44dc9ed014 418 {
// Keeps the low 16 bits of every 32-bit lane, i.e. the even-indexed 16-bit elements.
joeverbout 0:ea44dc9ed014 419 __m128i v_mask = _mm_set1_epi32(0x0000ffff);
joeverbout 0:ea44dc9ed014 420
// Both the masked and the shifted operands lie in 0..0xffff, so the
// unsigned-saturating pack copies the values through unchanged.
joeverbout 0:ea44dc9ed014 421 __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
joeverbout 0:ea44dc9ed014 422 __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
joeverbout 0:ea44dc9ed014 423 __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
joeverbout 0:ea44dc9ed014 424 __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
joeverbout 0:ea44dc9ed014 425 __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
joeverbout 0:ea44dc9ed014 426 __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
joeverbout 0:ea44dc9ed014 427
joeverbout 0:ea44dc9ed014 428 __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
joeverbout 0:ea44dc9ed014 429 __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
joeverbout 0:ea44dc9ed014 430 __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
joeverbout 0:ea44dc9ed014 431 __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
joeverbout 0:ea44dc9ed014 432 __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
joeverbout 0:ea44dc9ed014 433 __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
joeverbout 0:ea44dc9ed014 434
joeverbout 0:ea44dc9ed014 435 __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
joeverbout 0:ea44dc9ed014 436 __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
joeverbout 0:ea44dc9ed014 437 __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
joeverbout 0:ea44dc9ed014 438 __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
joeverbout 0:ea44dc9ed014 439 __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
joeverbout 0:ea44dc9ed014 440 __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
joeverbout 0:ea44dc9ed014 441
// Final pass: slots 0..5 are v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, so pair k
// feeds slots k and k + 3 (v_r0/v_g1, v_r1/v_b0, v_g0/v_b1).
joeverbout 0:ea44dc9ed014 442 v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
joeverbout 0:ea44dc9ed014 443 v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
joeverbout 0:ea44dc9ed014 444 v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
joeverbout 0:ea44dc9ed014 445 v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
joeverbout 0:ea44dc9ed014 446 v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
joeverbout 0:ea44dc9ed014 447 v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
joeverbout 0:ea44dc9ed014 448 }
joeverbout 0:ea44dc9ed014 449
// Merges four planar 16-bit channels into four-way interleaved order, in place.
//
// On entry v_r0:v_r1, v_g0:v_g1, v_b0:v_b1 and v_a0:v_a1 each hold 16 16-bit
// elements of one channel (call them channels 0..3). On return the eight
// registers, read in parameter order (v_r0, v_r1, v_g0, v_g1, v_b0, v_b1,
// v_a0, v_a1), form one run of 64 elements in which position 4 * j + c holds
// old element j of channel c.
//
// The work is done in four passes. Each pass takes the adjacent slot pair
// (2k, 2k + 1) and writes its even-indexed 16-bit elements to slot k and its
// odd-indexed ones to slot k + 4.
joeverbout 0:ea44dc9ed014 450 inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
joeverbout 0:ea44dc9ed014 451 __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
joeverbout 0:ea44dc9ed014 452 {
// Keeps the low 16 bits of every 32-bit lane, i.e. the even-indexed 16-bit elements.
joeverbout 0:ea44dc9ed014 453 __m128i v_mask = _mm_set1_epi32(0x0000ffff);
joeverbout 0:ea44dc9ed014 454
// Both the masked and the shifted operands lie in 0..0xffff, so the
// unsigned-saturating pack copies the values through unchanged.
joeverbout 0:ea44dc9ed014 455 __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
joeverbout 0:ea44dc9ed014 456 __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
joeverbout 0:ea44dc9ed014 457 __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
joeverbout 0:ea44dc9ed014 458 __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
joeverbout 0:ea44dc9ed014 459 __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
joeverbout 0:ea44dc9ed014 460 __m128i layer3_chunk6 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
joeverbout 0:ea44dc9ed014 461 __m128i layer3_chunk3 = _mm_packus_epi32(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
joeverbout 0:ea44dc9ed014 462 __m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16));
joeverbout 0:ea44dc9ed014 463
joeverbout 0:ea44dc9ed014 464 __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
joeverbout 0:ea44dc9ed014 465 __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
joeverbout 0:ea44dc9ed014 466 __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
joeverbout 0:ea44dc9ed014 467 __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
joeverbout 0:ea44dc9ed014 468 __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
joeverbout 0:ea44dc9ed014 469 __m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
joeverbout 0:ea44dc9ed014 470 __m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
joeverbout 0:ea44dc9ed014 471 __m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16));
joeverbout 0:ea44dc9ed014 472
joeverbout 0:ea44dc9ed014 473 __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
joeverbout 0:ea44dc9ed014 474 __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
joeverbout 0:ea44dc9ed014 475 __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
joeverbout 0:ea44dc9ed014 476 __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
joeverbout 0:ea44dc9ed014 477 __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
joeverbout 0:ea44dc9ed014 478 __m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
joeverbout 0:ea44dc9ed014 479 __m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
joeverbout 0:ea44dc9ed014 480 __m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16));
joeverbout 0:ea44dc9ed014 481
// Final pass: slots 0..7 follow parameter order, so pair k feeds slots k and
// k + 4 (v_r0/v_b0, v_r1/v_b1, v_g0/v_a0, v_g1/v_a1).
joeverbout 0:ea44dc9ed014 482 v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
joeverbout 0:ea44dc9ed014 483 v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
joeverbout 0:ea44dc9ed014 484 v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
joeverbout 0:ea44dc9ed014 485 v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
joeverbout 0:ea44dc9ed014 486 v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
joeverbout 0:ea44dc9ed014 487 v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
joeverbout 0:ea44dc9ed014 488 v_g1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
joeverbout 0:ea44dc9ed014 489 v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
joeverbout 0:ea44dc9ed014 490 }
joeverbout 0:ea44dc9ed014 491
joeverbout 0:ea44dc9ed014 492 #endif // CV_SSE4_1
joeverbout 0:ea44dc9ed014 493
// Splits two-way interleaved single-precision data into two per-channel register pairs, in place.
//
// Read in parameter order (v_r0, v_r1, v_g0, v_g1), the four registers are
// treated as one run of 16 floats. On return, element i of that run sits in
// v_r0:v_r1 when i is even and in v_g0:v_g1 when i is odd, at position i / 2
// within that pair. All four arguments are overwritten.
//
// The work is done in three passes. Each pass pairs slot k with slot k + 2 and
// writes their low/high 32-bit unpack to slots 2k and 2k + 1.
joeverbout 0:ea44dc9ed014 494 inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
joeverbout 0:ea44dc9ed014 495 {
joeverbout 0:ea44dc9ed014 496 __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0);
joeverbout 0:ea44dc9ed014 497 __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0);
joeverbout 0:ea44dc9ed014 498 __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1);
joeverbout 0:ea44dc9ed014 499 __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1);
joeverbout 0:ea44dc9ed014 500
joeverbout 0:ea44dc9ed014 501 __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2);
joeverbout 0:ea44dc9ed014 502 __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2);
joeverbout 0:ea44dc9ed014 503 __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3);
joeverbout 0:ea44dc9ed014 504 __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk3);
joeverbout 0:ea44dc9ed014 505
// Final pass: results are written back to the arguments in parameter order.
joeverbout 0:ea44dc9ed014 506 v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk2);
joeverbout 0:ea44dc9ed014 507 v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk2);
joeverbout 0:ea44dc9ed014 508 v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk3);
joeverbout 0:ea44dc9ed014 509 v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk3);
joeverbout 0:ea44dc9ed014 510 }
joeverbout 0:ea44dc9ed014 511
// Splits three-way interleaved single-precision data into three per-channel register pairs, in place.
//
// Read in parameter order (v_r0, v_r1, v_g0, v_g1, v_b0, v_b1), the six
// registers are treated as one run of 24 floats. On return, element i of that
// run sits in the pair selected by i % 3
// (0 -> v_r0:v_r1, 1 -> v_g0:v_g1, 2 -> v_b0:v_b1),
// at position i / 3 within that pair. All six arguments are overwritten.
//
// The work is done in three passes. Each pass pairs slot k with slot k + 3 and
// writes their low/high 32-bit unpack to slots 2k and 2k + 1.
joeverbout 0:ea44dc9ed014 512 inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
joeverbout 0:ea44dc9ed014 513 __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
joeverbout 0:ea44dc9ed014 514 {
// Slots 0..5 follow parameter order, so "k with k + 3" pairs v_r0/v_g1, v_r1/v_b0, v_g0/v_b1.
joeverbout 0:ea44dc9ed014 515 __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1);
joeverbout 0:ea44dc9ed014 516 __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1);
joeverbout 0:ea44dc9ed014 517 __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0);
joeverbout 0:ea44dc9ed014 518 __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0);
joeverbout 0:ea44dc9ed014 519 __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1);
joeverbout 0:ea44dc9ed014 520 __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1);
joeverbout 0:ea44dc9ed014 521
joeverbout 0:ea44dc9ed014 522 __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3);
joeverbout 0:ea44dc9ed014 523 __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3);
joeverbout 0:ea44dc9ed014 524 __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4);
joeverbout 0:ea44dc9ed014 525 __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4);
joeverbout 0:ea44dc9ed014 526 __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5);
joeverbout 0:ea44dc9ed014 527 __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5);
joeverbout 0:ea44dc9ed014 528
// Final pass: results are written back to the arguments in parameter order.
joeverbout 0:ea44dc9ed014 529 v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3);
joeverbout 0:ea44dc9ed014 530 v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3);
joeverbout 0:ea44dc9ed014 531 v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4);
joeverbout 0:ea44dc9ed014 532 v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4);
joeverbout 0:ea44dc9ed014 533 v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5);
joeverbout 0:ea44dc9ed014 534 v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5);
joeverbout 0:ea44dc9ed014 535 }
joeverbout 0:ea44dc9ed014 536
// Splits four-way interleaved single-precision data into four per-channel register pairs, in place.
//
// Read in parameter order (v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1), the
// eight registers are treated as one run of 32 floats. On return, element i of
// that run sits in the pair selected by i % 4
// (0 -> v_r0:v_r1, 1 -> v_g0:v_g1, 2 -> v_b0:v_b1, 3 -> v_a0:v_a1),
// at position i / 4 within that pair. All eight arguments are overwritten.
//
// The work is done in three passes. Each pass pairs slot k with slot k + 4 and
// writes their low/high 32-bit unpack to slots 2k and 2k + 1.
joeverbout 0:ea44dc9ed014 537 inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
joeverbout 0:ea44dc9ed014 538 __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
joeverbout 0:ea44dc9ed014 539 {
// Slots 0..7 follow parameter order, so "k with k + 4" pairs r with b and g with a.
joeverbout 0:ea44dc9ed014 540 __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0);
joeverbout 0:ea44dc9ed014 541 __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0);
joeverbout 0:ea44dc9ed014 542 __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1);
joeverbout 0:ea44dc9ed014 543 __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b1);
joeverbout 0:ea44dc9ed014 544 __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_a0);
joeverbout 0:ea44dc9ed014 545 __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_a0);
joeverbout 0:ea44dc9ed014 546 __m128 layer1_chunk6 = _mm_unpacklo_ps(v_g1, v_a1);
joeverbout 0:ea44dc9ed014 547 __m128 layer1_chunk7 = _mm_unpackhi_ps(v_g1, v_a1);
joeverbout 0:ea44dc9ed014 548
joeverbout 0:ea44dc9ed014 549 __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk4);
joeverbout 0:ea44dc9ed014 550 __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk4);
joeverbout 0:ea44dc9ed014 551 __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk5);
joeverbout 0:ea44dc9ed014 552 __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk5);
joeverbout 0:ea44dc9ed014 553 __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk6);
joeverbout 0:ea44dc9ed014 554 __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk6);
joeverbout 0:ea44dc9ed014 555 __m128 layer2_chunk6 = _mm_unpacklo_ps(layer1_chunk3, layer1_chunk7);
joeverbout 0:ea44dc9ed014 556 __m128 layer2_chunk7 = _mm_unpackhi_ps(layer1_chunk3, layer1_chunk7);
joeverbout 0:ea44dc9ed014 557
// Final pass: results are written back to the arguments in parameter order.
joeverbout 0:ea44dc9ed014 558 v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk4);
joeverbout 0:ea44dc9ed014 559 v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk4);
joeverbout 0:ea44dc9ed014 560 v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk5);
joeverbout 0:ea44dc9ed014 561 v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk5);
joeverbout 0:ea44dc9ed014 562 v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk6);
joeverbout 0:ea44dc9ed014 563 v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk6);
joeverbout 0:ea44dc9ed014 564 v_a0 = _mm_unpacklo_ps(layer2_chunk3, layer2_chunk7);
joeverbout 0:ea44dc9ed014 565 v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7);
joeverbout 0:ea44dc9ed014 566 }
joeverbout 0:ea44dc9ed014 567
// Merges two planar single-precision channels into two-way interleaved order, in place.
//
// On entry v_r0:v_r1 and v_g0:v_g1 each hold 8 floats of one channel. On return
// the four registers, read in parameter order (v_r0, v_r1, v_g0, v_g1), form
// one run of 16 floats in which position 2 * j holds old element j of
// v_r0:v_r1 and position 2 * j + 1 holds old element j of v_g0:v_g1.
//
// The work is done in three passes. Each pass takes the adjacent slot pair
// (2k, 2k + 1) and writes its even-indexed floats to slot k and its
// odd-indexed ones to slot k + 2.
joeverbout 0:ea44dc9ed014 568 inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
joeverbout 0:ea44dc9ed014 569 {
// mask_lo picks elements 0 and 2 of each operand, mask_hi picks elements 1 and 3.
joeverbout 0:ea44dc9ed014 570 const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
joeverbout 0:ea44dc9ed014 571
joeverbout 0:ea44dc9ed014 572 __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
joeverbout 0:ea44dc9ed014 573 __m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
joeverbout 0:ea44dc9ed014 574 __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
joeverbout 0:ea44dc9ed014 575 __m128 layer2_chunk3 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
joeverbout 0:ea44dc9ed014 576
joeverbout 0:ea44dc9ed014 577 __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
joeverbout 0:ea44dc9ed014 578 __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
joeverbout 0:ea44dc9ed014 579 __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
joeverbout 0:ea44dc9ed014 580 __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
joeverbout 0:ea44dc9ed014 581
// Final pass: slots 0..3 are v_r0, v_r1, v_g0, v_g1, hence the r0/g0/r1/g1 write order.
joeverbout 0:ea44dc9ed014 582 v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
joeverbout 0:ea44dc9ed014 583 v_g0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
joeverbout 0:ea44dc9ed014 584 v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
joeverbout 0:ea44dc9ed014 585 v_g1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
joeverbout 0:ea44dc9ed014 586 }
joeverbout 0:ea44dc9ed014 587
// Merges three planar single-precision channels into three-way interleaved order, in place.
//
// On entry v_r0:v_r1, v_g0:v_g1 and v_b0:v_b1 each hold 8 floats of one channel
// (call them channels 0, 1 and 2). On return the six registers, read in
// parameter order (v_r0, v_r1, v_g0, v_g1, v_b0, v_b1), form one run of 24
// floats in which position 3 * j + c holds old element j of channel c.
//
// The work is done in three passes. Each pass takes the adjacent slot pair
// (2k, 2k + 1) and writes its even-indexed floats to slot k and its
// odd-indexed ones to slot k + 3.
joeverbout 0:ea44dc9ed014 588 inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
joeverbout 0:ea44dc9ed014 589 __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
joeverbout 0:ea44dc9ed014 590 {
// mask_lo picks elements 0 and 2 of each operand, mask_hi picks elements 1 and 3.
joeverbout 0:ea44dc9ed014 591 const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
joeverbout 0:ea44dc9ed014 592
joeverbout 0:ea44dc9ed014 593 __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
joeverbout 0:ea44dc9ed014 594 __m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
joeverbout 0:ea44dc9ed014 595 __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
joeverbout 0:ea44dc9ed014 596 __m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
joeverbout 0:ea44dc9ed014 597 __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
joeverbout 0:ea44dc9ed014 598 __m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
joeverbout 0:ea44dc9ed014 599
joeverbout 0:ea44dc9ed014 600 __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
joeverbout 0:ea44dc9ed014 601 __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
joeverbout 0:ea44dc9ed014 602 __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
joeverbout 0:ea44dc9ed014 603 __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
joeverbout 0:ea44dc9ed014 604 __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
joeverbout 0:ea44dc9ed014 605 __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
joeverbout 0:ea44dc9ed014 606
// Final pass: slots 0..5 follow parameter order, so pair k feeds slots k and
// k + 3 (v_r0/v_g1, v_r1/v_b0, v_g0/v_b1).
joeverbout 0:ea44dc9ed014 607 v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
joeverbout 0:ea44dc9ed014 608 v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
joeverbout 0:ea44dc9ed014 609 v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
joeverbout 0:ea44dc9ed014 610 v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
joeverbout 0:ea44dc9ed014 611 v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
joeverbout 0:ea44dc9ed014 612 v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
joeverbout 0:ea44dc9ed014 613 }
joeverbout 0:ea44dc9ed014 614
// Merges four planar single-precision channels into four-way interleaved order, in place.
//
// On entry v_r0:v_r1, v_g0:v_g1, v_b0:v_b1 and v_a0:v_a1 each hold 8 floats of
// one channel (call them channels 0..3). On return the eight registers, read in
// parameter order (v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1), form one
// run of 32 floats in which position 4 * j + c holds old element j of channel c.
//
// The work is done in three passes. Each pass takes the adjacent slot pair
// (2k, 2k + 1) and writes its even-indexed floats to slot k and its
// odd-indexed ones to slot k + 4.
joeverbout 0:ea44dc9ed014 615 inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
joeverbout 0:ea44dc9ed014 616 __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
joeverbout 0:ea44dc9ed014 617 {
// mask_lo picks elements 0 and 2 of each operand, mask_hi picks elements 1 and 3.
joeverbout 0:ea44dc9ed014 618 const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
joeverbout 0:ea44dc9ed014 619
joeverbout 0:ea44dc9ed014 620 __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
joeverbout 0:ea44dc9ed014 621 __m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
joeverbout 0:ea44dc9ed014 622 __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
joeverbout 0:ea44dc9ed014 623 __m128 layer2_chunk5 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
joeverbout 0:ea44dc9ed014 624 __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
joeverbout 0:ea44dc9ed014 625 __m128 layer2_chunk6 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
joeverbout 0:ea44dc9ed014 626 __m128 layer2_chunk3 = _mm_shuffle_ps(v_a0, v_a1, mask_lo);
joeverbout 0:ea44dc9ed014 627 __m128 layer2_chunk7 = _mm_shuffle_ps(v_a0, v_a1, mask_hi);
joeverbout 0:ea44dc9ed014 628
joeverbout 0:ea44dc9ed014 629 __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
joeverbout 0:ea44dc9ed014 630 __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
joeverbout 0:ea44dc9ed014 631 __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
joeverbout 0:ea44dc9ed014 632 __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
joeverbout 0:ea44dc9ed014 633 __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
joeverbout 0:ea44dc9ed014 634 __m128 layer1_chunk6 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
joeverbout 0:ea44dc9ed014 635 __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_lo);
joeverbout 0:ea44dc9ed014 636 __m128 layer1_chunk7 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_hi);
joeverbout 0:ea44dc9ed014 637
// Final pass: slots 0..7 follow parameter order, so pair k feeds slots k and
// k + 4 (v_r0/v_b0, v_r1/v_b1, v_g0/v_a0, v_g1/v_a1).
joeverbout 0:ea44dc9ed014 638 v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
joeverbout 0:ea44dc9ed014 639 v_b0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
joeverbout 0:ea44dc9ed014 640 v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
joeverbout 0:ea44dc9ed014 641 v_b1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
joeverbout 0:ea44dc9ed014 642 v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
joeverbout 0:ea44dc9ed014 643 v_a0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
joeverbout 0:ea44dc9ed014 644 v_g1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_lo);
joeverbout 0:ea44dc9ed014 645 v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
joeverbout 0:ea44dc9ed014 646 }
joeverbout 0:ea44dc9ed014 647
joeverbout 0:ea44dc9ed014 648 #endif // CV_SSE2
joeverbout 0:ea44dc9ed014 649
joeverbout 0:ea44dc9ed014 650 //! @}
joeverbout 0:ea44dc9ed014 651
joeverbout 0:ea44dc9ed014 652 #endif //__OPENCV_CORE_SSE_UTILS_HPP__
joeverbout 0:ea44dc9ed014 653