Joe Verbout / main / opencv on mbed
opencv2/core/sse_utils.hpp@0:ea44dc9ed014, 2016-03-31 (annotated)
- Committer: joeverbout
- Date: Thu Mar 31 21:16:38 2016 +0000
- Revision: 0:ea44dc9ed014
OpenCV on mbed attempt
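The file annotated below is OpenCV's `sse_utils.hpp`: a set of inline SSE2/SSE4.1 helpers that deinterleave and interleave 2-, 3- and 4-channel pixel data held in `__m128i`/`__m128` registers (8-bit, 16-bit and float variants), as used by OpenCV's split/merge and color-conversion kernels. As a quick orientation, here is a minimal usage sketch that is not part of the original page: it assumes this header is included and follows the calling convention implied by the code, where the arguments are loaded from consecutive 16-byte chunks of the interleaved stream and are overwritten in place with the planar result.

```cpp
// Hedged usage sketch: split 32 interleaved byte pairs (c0 c1 c0 c1 ...)
// into two 32-byte planes with _mm_deinterleave_epi8. The wrapper function
// and buffer names are illustrative assumptions, not part of sse_utils.hpp.
#include <emmintrin.h>                    // SSE2 intrinsics
#include <cstdint>
#include "opencv2/core/sse_utils.hpp"     // the header annotated below

static void split_2ch_32px(const uint8_t* interleaved,  // 64 bytes in
                           uint8_t* plane0,              // 32 bytes out
                           uint8_t* plane1)              // 32 bytes out
{
    // Four consecutive 16-byte chunks of the interleaved stream.
    __m128i v_r0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(interleaved));
    __m128i v_r1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(interleaved + 16));
    __m128i v_g0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(interleaved + 32));
    __m128i v_g1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(interleaved + 48));

    // Arguments are taken by reference and rewritten: afterwards v_r0/v_r1
    // hold the 32 channel-0 bytes and v_g0/v_g1 the 32 channel-1 bytes.
    _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1);

    _mm_storeu_si128(reinterpret_cast<__m128i*>(plane0),      v_r0);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(plane0 + 16), v_r1);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(plane1),      v_g0);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(plane1 + 16), v_g1);
}
```

The `_mm_interleave_*` counterparts reverse the transform (planar registers back to an interleaved layout); note that the 16-bit interleave variants sit behind `#if CV_SSE4_1` because they rely on `_mm_packus_epi32`.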
Who changed what in which revision?
User | Revision | Line number | New contents of line |
---|---|---|---|
joeverbout | 0:ea44dc9ed014 | 1 | /*M/////////////////////////////////////////////////////////////////////////////////////// |
joeverbout | 0:ea44dc9ed014 | 2 | // |
joeverbout | 0:ea44dc9ed014 | 3 | // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
joeverbout | 0:ea44dc9ed014 | 4 | // |
joeverbout | 0:ea44dc9ed014 | 5 | // By downloading, copying, installing or using the software you agree to this license. |
joeverbout | 0:ea44dc9ed014 | 6 | // If you do not agree to this license, do not download, install, |
joeverbout | 0:ea44dc9ed014 | 7 | // copy or use the software. |
joeverbout | 0:ea44dc9ed014 | 8 | // |
joeverbout | 0:ea44dc9ed014 | 9 | // |
joeverbout | 0:ea44dc9ed014 | 10 | // License Agreement |
joeverbout | 0:ea44dc9ed014 | 11 | // For Open Source Computer Vision Library |
joeverbout | 0:ea44dc9ed014 | 12 | // |
joeverbout | 0:ea44dc9ed014 | 13 | // Copyright (C) 2015, Itseez Inc., all rights reserved. |
joeverbout | 0:ea44dc9ed014 | 14 | // Third party copyrights are property of their respective owners. |
joeverbout | 0:ea44dc9ed014 | 15 | // |
joeverbout | 0:ea44dc9ed014 | 16 | // Redistribution and use in source and binary forms, with or without modification, |
joeverbout | 0:ea44dc9ed014 | 17 | // are permitted provided that the following conditions are met: |
joeverbout | 0:ea44dc9ed014 | 18 | // |
joeverbout | 0:ea44dc9ed014 | 19 | // * Redistribution's of source code must retain the above copyright notice, |
joeverbout | 0:ea44dc9ed014 | 20 | // this list of conditions and the following disclaimer. |
joeverbout | 0:ea44dc9ed014 | 21 | // |
joeverbout | 0:ea44dc9ed014 | 22 | // * Redistribution's in binary form must reproduce the above copyright notice, |
joeverbout | 0:ea44dc9ed014 | 23 | // this list of conditions and the following disclaimer in the documentation |
joeverbout | 0:ea44dc9ed014 | 24 | // and/or other materials provided with the distribution. |
joeverbout | 0:ea44dc9ed014 | 25 | // |
joeverbout | 0:ea44dc9ed014 | 26 | // * The name of the copyright holders may not be used to endorse or promote products |
joeverbout | 0:ea44dc9ed014 | 27 | // derived from this software without specific prior written permission. |
joeverbout | 0:ea44dc9ed014 | 28 | // |
joeverbout | 0:ea44dc9ed014 | 29 | // This software is provided by the copyright holders and contributors "as is" and |
joeverbout | 0:ea44dc9ed014 | 30 | // any express or implied warranties, including, but not limited to, the implied |
joeverbout | 0:ea44dc9ed014 | 31 | // warranties of merchantability and fitness for a particular purpose are disclaimed. |
joeverbout | 0:ea44dc9ed014 | 32 | // In no event shall the Intel Corporation or contributors be liable for any direct, |
joeverbout | 0:ea44dc9ed014 | 33 | // indirect, incidental, special, exemplary, or consequential damages |
joeverbout | 0:ea44dc9ed014 | 34 | // (including, but not limited to, procurement of substitute goods or services; |
joeverbout | 0:ea44dc9ed014 | 35 | // loss of use, data, or profits; or business interruption) however caused |
joeverbout | 0:ea44dc9ed014 | 36 | // and on any theory of liability, whether in contract, strict liability, |
joeverbout | 0:ea44dc9ed014 | 37 | // or tort (including negligence or otherwise) arising in any way out of |
joeverbout | 0:ea44dc9ed014 | 38 | // the use of this software, even if advised of the possibility of such damage. |
joeverbout | 0:ea44dc9ed014 | 39 | // |
joeverbout | 0:ea44dc9ed014 | 40 | //M*/ |
joeverbout | 0:ea44dc9ed014 | 41 | |
joeverbout | 0:ea44dc9ed014 | 42 | #ifndef __OPENCV_CORE_SSE_UTILS_HPP__ |
joeverbout | 0:ea44dc9ed014 | 43 | #define __OPENCV_CORE_SSE_UTILS_HPP__ |
joeverbout | 0:ea44dc9ed014 | 44 | |
joeverbout | 0:ea44dc9ed014 | 45 | #ifndef __cplusplus |
joeverbout | 0:ea44dc9ed014 | 46 | # error sse_utils.hpp header must be compiled as C++ |
joeverbout | 0:ea44dc9ed014 | 47 | #endif |
joeverbout | 0:ea44dc9ed014 | 48 | |
joeverbout | 0:ea44dc9ed014 | 49 | #include "opencv2/core/cvdef.h" |
joeverbout | 0:ea44dc9ed014 | 50 | |
joeverbout | 0:ea44dc9ed014 | 51 | //! @addtogroup core_utils_sse |
joeverbout | 0:ea44dc9ed014 | 52 | //! @{ |
joeverbout | 0:ea44dc9ed014 | 53 | |
joeverbout | 0:ea44dc9ed014 | 54 | #if CV_SSE2 |
joeverbout | 0:ea44dc9ed014 | 55 | |
joeverbout | 0:ea44dc9ed014 | 56 | inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1) |
joeverbout | 0:ea44dc9ed014 | 57 | { |
joeverbout | 0:ea44dc9ed014 | 58 | __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g0); |
joeverbout | 0:ea44dc9ed014 | 59 | __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g0); |
joeverbout | 0:ea44dc9ed014 | 60 | __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_g1); |
joeverbout | 0:ea44dc9ed014 | 61 | __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_g1); |
joeverbout | 0:ea44dc9ed014 | 62 | |
joeverbout | 0:ea44dc9ed014 | 63 | __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk2); |
joeverbout | 0:ea44dc9ed014 | 64 | __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk2); |
joeverbout | 0:ea44dc9ed014 | 65 | __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk3); |
joeverbout | 0:ea44dc9ed014 | 66 | __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk3); |
joeverbout | 0:ea44dc9ed014 | 67 | |
joeverbout | 0:ea44dc9ed014 | 68 | __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk2); |
joeverbout | 0:ea44dc9ed014 | 69 | __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk2); |
joeverbout | 0:ea44dc9ed014 | 70 | __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk3); |
joeverbout | 0:ea44dc9ed014 | 71 | __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk3); |
joeverbout | 0:ea44dc9ed014 | 72 | |
joeverbout | 0:ea44dc9ed014 | 73 | __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk2); |
joeverbout | 0:ea44dc9ed014 | 74 | __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk2); |
joeverbout | 0:ea44dc9ed014 | 75 | __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk3); |
joeverbout | 0:ea44dc9ed014 | 76 | __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk3); |
joeverbout | 0:ea44dc9ed014 | 77 | |
joeverbout | 0:ea44dc9ed014 | 78 | v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk2); |
joeverbout | 0:ea44dc9ed014 | 79 | v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk2); |
joeverbout | 0:ea44dc9ed014 | 80 | v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk3); |
joeverbout | 0:ea44dc9ed014 | 81 | v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk3); |
joeverbout | 0:ea44dc9ed014 | 82 | } |
joeverbout | 0:ea44dc9ed014 | 83 | |
joeverbout | 0:ea44dc9ed014 | 84 | inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, |
joeverbout | 0:ea44dc9ed014 | 85 | __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) |
joeverbout | 0:ea44dc9ed014 | 86 | { |
joeverbout | 0:ea44dc9ed014 | 87 | __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g1); |
joeverbout | 0:ea44dc9ed014 | 88 | __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g1); |
joeverbout | 0:ea44dc9ed014 | 89 | __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b0); |
joeverbout | 0:ea44dc9ed014 | 90 | __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b0); |
joeverbout | 0:ea44dc9ed014 | 91 | __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_b1); |
joeverbout | 0:ea44dc9ed014 | 92 | __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_b1); |
joeverbout | 0:ea44dc9ed014 | 93 | |
joeverbout | 0:ea44dc9ed014 | 94 | __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3); |
joeverbout | 0:ea44dc9ed014 | 95 | __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3); |
joeverbout | 0:ea44dc9ed014 | 96 | __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4); |
joeverbout | 0:ea44dc9ed014 | 97 | __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4); |
joeverbout | 0:ea44dc9ed014 | 98 | __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5); |
joeverbout | 0:ea44dc9ed014 | 99 | __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5); |
joeverbout | 0:ea44dc9ed014 | 100 | |
joeverbout | 0:ea44dc9ed014 | 101 | __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3); |
joeverbout | 0:ea44dc9ed014 | 102 | __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3); |
joeverbout | 0:ea44dc9ed014 | 103 | __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4); |
joeverbout | 0:ea44dc9ed014 | 104 | __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4); |
joeverbout | 0:ea44dc9ed014 | 105 | __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5); |
joeverbout | 0:ea44dc9ed014 | 106 | __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5); |
joeverbout | 0:ea44dc9ed014 | 107 | |
joeverbout | 0:ea44dc9ed014 | 108 | __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3); |
joeverbout | 0:ea44dc9ed014 | 109 | __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3); |
joeverbout | 0:ea44dc9ed014 | 110 | __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4); |
joeverbout | 0:ea44dc9ed014 | 111 | __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4); |
joeverbout | 0:ea44dc9ed014 | 112 | __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5); |
joeverbout | 0:ea44dc9ed014 | 113 | __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5); |
joeverbout | 0:ea44dc9ed014 | 114 | |
joeverbout | 0:ea44dc9ed014 | 115 | v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3); |
joeverbout | 0:ea44dc9ed014 | 116 | v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3); |
joeverbout | 0:ea44dc9ed014 | 117 | v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4); |
joeverbout | 0:ea44dc9ed014 | 118 | v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4); |
joeverbout | 0:ea44dc9ed014 | 119 | v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5); |
joeverbout | 0:ea44dc9ed014 | 120 | v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5); |
joeverbout | 0:ea44dc9ed014 | 121 | } |
joeverbout | 0:ea44dc9ed014 | 122 | |
joeverbout | 0:ea44dc9ed014 | 123 | inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, |
joeverbout | 0:ea44dc9ed014 | 124 | __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) |
joeverbout | 0:ea44dc9ed014 | 125 | { |
joeverbout | 0:ea44dc9ed014 | 126 | __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0); |
joeverbout | 0:ea44dc9ed014 | 127 | __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0); |
joeverbout | 0:ea44dc9ed014 | 128 | __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b1); |
joeverbout | 0:ea44dc9ed014 | 129 | __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b1); |
joeverbout | 0:ea44dc9ed014 | 130 | __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_a0); |
joeverbout | 0:ea44dc9ed014 | 131 | __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_a0); |
joeverbout | 0:ea44dc9ed014 | 132 | __m128i layer1_chunk6 = _mm_unpacklo_epi8(v_g1, v_a1); |
joeverbout | 0:ea44dc9ed014 | 133 | __m128i layer1_chunk7 = _mm_unpackhi_epi8(v_g1, v_a1); |
joeverbout | 0:ea44dc9ed014 | 134 | |
joeverbout | 0:ea44dc9ed014 | 135 | __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk4); |
joeverbout | 0:ea44dc9ed014 | 136 | __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk4); |
joeverbout | 0:ea44dc9ed014 | 137 | __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk5); |
joeverbout | 0:ea44dc9ed014 | 138 | __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk5); |
joeverbout | 0:ea44dc9ed014 | 139 | __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk6); |
joeverbout | 0:ea44dc9ed014 | 140 | __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk6); |
joeverbout | 0:ea44dc9ed014 | 141 | __m128i layer2_chunk6 = _mm_unpacklo_epi8(layer1_chunk3, layer1_chunk7); |
joeverbout | 0:ea44dc9ed014 | 142 | __m128i layer2_chunk7 = _mm_unpackhi_epi8(layer1_chunk3, layer1_chunk7); |
joeverbout | 0:ea44dc9ed014 | 143 | |
joeverbout | 0:ea44dc9ed014 | 144 | __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk4); |
joeverbout | 0:ea44dc9ed014 | 145 | __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk4); |
joeverbout | 0:ea44dc9ed014 | 146 | __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk5); |
joeverbout | 0:ea44dc9ed014 | 147 | __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk5); |
joeverbout | 0:ea44dc9ed014 | 148 | __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk6); |
joeverbout | 0:ea44dc9ed014 | 149 | __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk6); |
joeverbout | 0:ea44dc9ed014 | 150 | __m128i layer3_chunk6 = _mm_unpacklo_epi8(layer2_chunk3, layer2_chunk7); |
joeverbout | 0:ea44dc9ed014 | 151 | __m128i layer3_chunk7 = _mm_unpackhi_epi8(layer2_chunk3, layer2_chunk7); |
joeverbout | 0:ea44dc9ed014 | 152 | |
joeverbout | 0:ea44dc9ed014 | 153 | __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk4); |
joeverbout | 0:ea44dc9ed014 | 154 | __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk4); |
joeverbout | 0:ea44dc9ed014 | 155 | __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk5); |
joeverbout | 0:ea44dc9ed014 | 156 | __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk5); |
joeverbout | 0:ea44dc9ed014 | 157 | __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk6); |
joeverbout | 0:ea44dc9ed014 | 158 | __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk6); |
joeverbout | 0:ea44dc9ed014 | 159 | __m128i layer4_chunk6 = _mm_unpacklo_epi8(layer3_chunk3, layer3_chunk7); |
joeverbout | 0:ea44dc9ed014 | 160 | __m128i layer4_chunk7 = _mm_unpackhi_epi8(layer3_chunk3, layer3_chunk7); |
joeverbout | 0:ea44dc9ed014 | 161 | |
joeverbout | 0:ea44dc9ed014 | 162 | v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk4); |
joeverbout | 0:ea44dc9ed014 | 163 | v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk4); |
joeverbout | 0:ea44dc9ed014 | 164 | v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk5); |
joeverbout | 0:ea44dc9ed014 | 165 | v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk5); |
joeverbout | 0:ea44dc9ed014 | 166 | v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk6); |
joeverbout | 0:ea44dc9ed014 | 167 | v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk6); |
joeverbout | 0:ea44dc9ed014 | 168 | v_a0 = _mm_unpacklo_epi8(layer4_chunk3, layer4_chunk7); |
joeverbout | 0:ea44dc9ed014 | 169 | v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7); |
joeverbout | 0:ea44dc9ed014 | 170 | } |
joeverbout | 0:ea44dc9ed014 | 171 | |
joeverbout | 0:ea44dc9ed014 | 172 | inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1) |
joeverbout | 0:ea44dc9ed014 | 173 | { |
joeverbout | 0:ea44dc9ed014 | 174 | __m128i v_mask = _mm_set1_epi16(0x00ff); |
joeverbout | 0:ea44dc9ed014 | 175 | |
joeverbout | 0:ea44dc9ed014 | 176 | __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 177 | __m128i layer4_chunk2 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8)); |
joeverbout | 0:ea44dc9ed014 | 178 | __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 179 | __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8)); |
joeverbout | 0:ea44dc9ed014 | 180 | |
joeverbout | 0:ea44dc9ed014 | 181 | __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 182 | __m128i layer3_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8)); |
joeverbout | 0:ea44dc9ed014 | 183 | __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 184 | __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8)); |
joeverbout | 0:ea44dc9ed014 | 185 | |
joeverbout | 0:ea44dc9ed014 | 186 | __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 187 | __m128i layer2_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8)); |
joeverbout | 0:ea44dc9ed014 | 188 | __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 189 | __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8)); |
joeverbout | 0:ea44dc9ed014 | 190 | |
joeverbout | 0:ea44dc9ed014 | 191 | __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 192 | __m128i layer1_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8)); |
joeverbout | 0:ea44dc9ed014 | 193 | __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 194 | __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8)); |
joeverbout | 0:ea44dc9ed014 | 195 | |
joeverbout | 0:ea44dc9ed014 | 196 | v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 197 | v_g0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8)); |
joeverbout | 0:ea44dc9ed014 | 198 | v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 199 | v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8)); |
joeverbout | 0:ea44dc9ed014 | 200 | } |
joeverbout | 0:ea44dc9ed014 | 201 | |
joeverbout | 0:ea44dc9ed014 | 202 | inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, |
joeverbout | 0:ea44dc9ed014 | 203 | __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) |
joeverbout | 0:ea44dc9ed014 | 204 | { |
joeverbout | 0:ea44dc9ed014 | 205 | __m128i v_mask = _mm_set1_epi16(0x00ff); |
joeverbout | 0:ea44dc9ed014 | 206 | |
joeverbout | 0:ea44dc9ed014 | 207 | __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 208 | __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8)); |
joeverbout | 0:ea44dc9ed014 | 209 | __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 210 | __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8)); |
joeverbout | 0:ea44dc9ed014 | 211 | __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 212 | __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8)); |
joeverbout | 0:ea44dc9ed014 | 213 | |
joeverbout | 0:ea44dc9ed014 | 214 | __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 215 | __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8)); |
joeverbout | 0:ea44dc9ed014 | 216 | __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 217 | __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8)); |
joeverbout | 0:ea44dc9ed014 | 218 | __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 219 | __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8)); |
joeverbout | 0:ea44dc9ed014 | 220 | |
joeverbout | 0:ea44dc9ed014 | 221 | __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 222 | __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8)); |
joeverbout | 0:ea44dc9ed014 | 223 | __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 224 | __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8)); |
joeverbout | 0:ea44dc9ed014 | 225 | __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 226 | __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8)); |
joeverbout | 0:ea44dc9ed014 | 227 | |
joeverbout | 0:ea44dc9ed014 | 228 | __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 229 | __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8)); |
joeverbout | 0:ea44dc9ed014 | 230 | __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 231 | __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8)); |
joeverbout | 0:ea44dc9ed014 | 232 | __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 233 | __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8)); |
joeverbout | 0:ea44dc9ed014 | 234 | |
joeverbout | 0:ea44dc9ed014 | 235 | v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 236 | v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8)); |
joeverbout | 0:ea44dc9ed014 | 237 | v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 238 | v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8)); |
joeverbout | 0:ea44dc9ed014 | 239 | v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 240 | v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8)); |
joeverbout | 0:ea44dc9ed014 | 241 | } |
joeverbout | 0:ea44dc9ed014 | 242 | |
joeverbout | 0:ea44dc9ed014 | 243 | inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, |
joeverbout | 0:ea44dc9ed014 | 244 | __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) |
joeverbout | 0:ea44dc9ed014 | 245 | { |
joeverbout | 0:ea44dc9ed014 | 246 | __m128i v_mask = _mm_set1_epi16(0x00ff); |
joeverbout | 0:ea44dc9ed014 | 247 | |
joeverbout | 0:ea44dc9ed014 | 248 | __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 249 | __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8)); |
joeverbout | 0:ea44dc9ed014 | 250 | __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 251 | __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8)); |
joeverbout | 0:ea44dc9ed014 | 252 | __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 253 | __m128i layer4_chunk6 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8)); |
joeverbout | 0:ea44dc9ed014 | 254 | __m128i layer4_chunk3 = _mm_packus_epi16(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 255 | __m128i layer4_chunk7 = _mm_packus_epi16(_mm_srli_epi16(v_a0, 8), _mm_srli_epi16(v_a1, 8)); |
joeverbout | 0:ea44dc9ed014 | 256 | |
joeverbout | 0:ea44dc9ed014 | 257 | __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 258 | __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8)); |
joeverbout | 0:ea44dc9ed014 | 259 | __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 260 | __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8)); |
joeverbout | 0:ea44dc9ed014 | 261 | __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 262 | __m128i layer3_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8)); |
joeverbout | 0:ea44dc9ed014 | 263 | __m128i layer3_chunk3 = _mm_packus_epi16(_mm_and_si128(layer4_chunk6, v_mask), _mm_and_si128(layer4_chunk7, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 264 | __m128i layer3_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk6, 8), _mm_srli_epi16(layer4_chunk7, 8)); |
joeverbout | 0:ea44dc9ed014 | 265 | |
joeverbout | 0:ea44dc9ed014 | 266 | __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 267 | __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8)); |
joeverbout | 0:ea44dc9ed014 | 268 | __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 269 | __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8)); |
joeverbout | 0:ea44dc9ed014 | 270 | __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 271 | __m128i layer2_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8)); |
joeverbout | 0:ea44dc9ed014 | 272 | __m128i layer2_chunk3 = _mm_packus_epi16(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 273 | __m128i layer2_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk6, 8), _mm_srli_epi16(layer3_chunk7, 8)); |
joeverbout | 0:ea44dc9ed014 | 274 | |
joeverbout | 0:ea44dc9ed014 | 275 | __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 276 | __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8)); |
joeverbout | 0:ea44dc9ed014 | 277 | __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 278 | __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8)); |
joeverbout | 0:ea44dc9ed014 | 279 | __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 280 | __m128i layer1_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8)); |
joeverbout | 0:ea44dc9ed014 | 281 | __m128i layer1_chunk3 = _mm_packus_epi16(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 282 | __m128i layer1_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk6, 8), _mm_srli_epi16(layer2_chunk7, 8)); |
joeverbout | 0:ea44dc9ed014 | 283 | |
joeverbout | 0:ea44dc9ed014 | 284 | v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 285 | v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8)); |
joeverbout | 0:ea44dc9ed014 | 286 | v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 287 | v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8)); |
joeverbout | 0:ea44dc9ed014 | 288 | v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 289 | v_a0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8)); |
joeverbout | 0:ea44dc9ed014 | 290 | v_g1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 291 | v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8)); |
joeverbout | 0:ea44dc9ed014 | 292 | } |
joeverbout | 0:ea44dc9ed014 | 293 | |
joeverbout | 0:ea44dc9ed014 | 294 | inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1) |
joeverbout | 0:ea44dc9ed014 | 295 | { |
joeverbout | 0:ea44dc9ed014 | 296 | __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0); |
joeverbout | 0:ea44dc9ed014 | 297 | __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0); |
joeverbout | 0:ea44dc9ed014 | 298 | __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1); |
joeverbout | 0:ea44dc9ed014 | 299 | __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_g1); |
joeverbout | 0:ea44dc9ed014 | 300 | |
joeverbout | 0:ea44dc9ed014 | 301 | __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk2); |
joeverbout | 0:ea44dc9ed014 | 302 | __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk2); |
joeverbout | 0:ea44dc9ed014 | 303 | __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk3); |
joeverbout | 0:ea44dc9ed014 | 304 | __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk3); |
joeverbout | 0:ea44dc9ed014 | 305 | |
joeverbout | 0:ea44dc9ed014 | 306 | __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk2); |
joeverbout | 0:ea44dc9ed014 | 307 | __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk2); |
joeverbout | 0:ea44dc9ed014 | 308 | __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk3); |
joeverbout | 0:ea44dc9ed014 | 309 | __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk3); |
joeverbout | 0:ea44dc9ed014 | 310 | |
joeverbout | 0:ea44dc9ed014 | 311 | v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk2); |
joeverbout | 0:ea44dc9ed014 | 312 | v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk2); |
joeverbout | 0:ea44dc9ed014 | 313 | v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk3); |
joeverbout | 0:ea44dc9ed014 | 314 | v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk3); |
joeverbout | 0:ea44dc9ed014 | 315 | } |
joeverbout | 0:ea44dc9ed014 | 316 | |
joeverbout | 0:ea44dc9ed014 | 317 | inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, |
joeverbout | 0:ea44dc9ed014 | 318 | __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) |
joeverbout | 0:ea44dc9ed014 | 319 | { |
joeverbout | 0:ea44dc9ed014 | 320 | __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1); |
joeverbout | 0:ea44dc9ed014 | 321 | __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1); |
joeverbout | 0:ea44dc9ed014 | 322 | __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b0); |
joeverbout | 0:ea44dc9ed014 | 323 | __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b0); |
joeverbout | 0:ea44dc9ed014 | 324 | __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_b1); |
joeverbout | 0:ea44dc9ed014 | 325 | __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_b1); |
joeverbout | 0:ea44dc9ed014 | 326 | |
joeverbout | 0:ea44dc9ed014 | 327 | __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3); |
joeverbout | 0:ea44dc9ed014 | 328 | __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3); |
joeverbout | 0:ea44dc9ed014 | 329 | __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4); |
joeverbout | 0:ea44dc9ed014 | 330 | __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4); |
joeverbout | 0:ea44dc9ed014 | 331 | __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5); |
joeverbout | 0:ea44dc9ed014 | 332 | __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5); |
joeverbout | 0:ea44dc9ed014 | 333 | |
joeverbout | 0:ea44dc9ed014 | 334 | __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3); |
joeverbout | 0:ea44dc9ed014 | 335 | __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3); |
joeverbout | 0:ea44dc9ed014 | 336 | __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4); |
joeverbout | 0:ea44dc9ed014 | 337 | __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4); |
joeverbout | 0:ea44dc9ed014 | 338 | __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5); |
joeverbout | 0:ea44dc9ed014 | 339 | __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5); |
joeverbout | 0:ea44dc9ed014 | 340 | |
joeverbout | 0:ea44dc9ed014 | 341 | v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3); |
joeverbout | 0:ea44dc9ed014 | 342 | v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3); |
joeverbout | 0:ea44dc9ed014 | 343 | v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4); |
joeverbout | 0:ea44dc9ed014 | 344 | v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4); |
joeverbout | 0:ea44dc9ed014 | 345 | v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5); |
joeverbout | 0:ea44dc9ed014 | 346 | v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5); |
joeverbout | 0:ea44dc9ed014 | 347 | } |
joeverbout | 0:ea44dc9ed014 | 348 | |
joeverbout | 0:ea44dc9ed014 | 349 | inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, |
joeverbout | 0:ea44dc9ed014 | 350 | __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) |
joeverbout | 0:ea44dc9ed014 | 351 | { |
joeverbout | 0:ea44dc9ed014 | 352 | __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0); |
joeverbout | 0:ea44dc9ed014 | 353 | __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0); |
joeverbout | 0:ea44dc9ed014 | 354 | __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b1); |
joeverbout | 0:ea44dc9ed014 | 355 | __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b1); |
joeverbout | 0:ea44dc9ed014 | 356 | __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_a0); |
joeverbout | 0:ea44dc9ed014 | 357 | __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0); |
joeverbout | 0:ea44dc9ed014 | 358 | __m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1); |
joeverbout | 0:ea44dc9ed014 | 359 | __m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1); |
joeverbout | 0:ea44dc9ed014 | 360 | |
joeverbout | 0:ea44dc9ed014 | 361 | __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4); |
joeverbout | 0:ea44dc9ed014 | 362 | __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4); |
joeverbout | 0:ea44dc9ed014 | 363 | __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5); |
joeverbout | 0:ea44dc9ed014 | 364 | __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk5); |
joeverbout | 0:ea44dc9ed014 | 365 | __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk6); |
joeverbout | 0:ea44dc9ed014 | 366 | __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk6); |
joeverbout | 0:ea44dc9ed014 | 367 | __m128i layer2_chunk6 = _mm_unpacklo_epi16(layer1_chunk3, layer1_chunk7); |
joeverbout | 0:ea44dc9ed014 | 368 | __m128i layer2_chunk7 = _mm_unpackhi_epi16(layer1_chunk3, layer1_chunk7); |
joeverbout | 0:ea44dc9ed014 | 369 | |
joeverbout | 0:ea44dc9ed014 | 370 | __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk4); |
joeverbout | 0:ea44dc9ed014 | 371 | __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk4); |
joeverbout | 0:ea44dc9ed014 | 372 | __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk5); |
joeverbout | 0:ea44dc9ed014 | 373 | __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk5); |
joeverbout | 0:ea44dc9ed014 | 374 | __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk6); |
joeverbout | 0:ea44dc9ed014 | 375 | __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk6); |
joeverbout | 0:ea44dc9ed014 | 376 | __m128i layer3_chunk6 = _mm_unpacklo_epi16(layer2_chunk3, layer2_chunk7); |
joeverbout | 0:ea44dc9ed014 | 377 | __m128i layer3_chunk7 = _mm_unpackhi_epi16(layer2_chunk3, layer2_chunk7); |
joeverbout | 0:ea44dc9ed014 | 378 | |
joeverbout | 0:ea44dc9ed014 | 379 | v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk4); |
joeverbout | 0:ea44dc9ed014 | 380 | v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk4); |
joeverbout | 0:ea44dc9ed014 | 381 | v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk5); |
joeverbout | 0:ea44dc9ed014 | 382 | v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk5); |
joeverbout | 0:ea44dc9ed014 | 383 | v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk6); |
joeverbout | 0:ea44dc9ed014 | 384 | v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk6); |
joeverbout | 0:ea44dc9ed014 | 385 | v_a0 = _mm_unpacklo_epi16(layer3_chunk3, layer3_chunk7); |
joeverbout | 0:ea44dc9ed014 | 386 | v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7); |
joeverbout | 0:ea44dc9ed014 | 387 | } |
joeverbout | 0:ea44dc9ed014 | 388 | |
joeverbout | 0:ea44dc9ed014 | 389 | #if CV_SSE4_1 |
joeverbout | 0:ea44dc9ed014 | 390 | |
joeverbout | 0:ea44dc9ed014 | 391 | inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1) |
joeverbout | 0:ea44dc9ed014 | 392 | { |
joeverbout | 0:ea44dc9ed014 | 393 | __m128i v_mask = _mm_set1_epi32(0x0000ffff); |
joeverbout | 0:ea44dc9ed014 | 394 | |
joeverbout | 0:ea44dc9ed014 | 395 | __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 396 | __m128i layer3_chunk2 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16)); |
joeverbout | 0:ea44dc9ed014 | 397 | __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 398 | __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16)); |
joeverbout | 0:ea44dc9ed014 | 399 | |
joeverbout | 0:ea44dc9ed014 | 400 | __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 401 | __m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16)); |
joeverbout | 0:ea44dc9ed014 | 402 | __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 403 | __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16)); |
joeverbout | 0:ea44dc9ed014 | 404 | |
joeverbout | 0:ea44dc9ed014 | 405 | __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 406 | __m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16)); |
joeverbout | 0:ea44dc9ed014 | 407 | __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 408 | __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16)); |
joeverbout | 0:ea44dc9ed014 | 409 | |
joeverbout | 0:ea44dc9ed014 | 410 | v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 411 | v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16)); |
joeverbout | 0:ea44dc9ed014 | 412 | v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 413 | v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16)); |
joeverbout | 0:ea44dc9ed014 | 414 | } |
joeverbout | 0:ea44dc9ed014 | 415 | |
joeverbout | 0:ea44dc9ed014 | 416 | inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, |
joeverbout | 0:ea44dc9ed014 | 417 | __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) |
joeverbout | 0:ea44dc9ed014 | 418 | { |
joeverbout | 0:ea44dc9ed014 | 419 | __m128i v_mask = _mm_set1_epi32(0x0000ffff); |
joeverbout | 0:ea44dc9ed014 | 420 | |
joeverbout | 0:ea44dc9ed014 | 421 | __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 422 | __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16)); |
joeverbout | 0:ea44dc9ed014 | 423 | __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 424 | __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16)); |
joeverbout | 0:ea44dc9ed014 | 425 | __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 426 | __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16)); |
joeverbout | 0:ea44dc9ed014 | 427 | |
joeverbout | 0:ea44dc9ed014 | 428 | __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 429 | __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16)); |
joeverbout | 0:ea44dc9ed014 | 430 | __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 431 | __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16)); |
joeverbout | 0:ea44dc9ed014 | 432 | __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 433 | __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16)); |
joeverbout | 0:ea44dc9ed014 | 434 | |
joeverbout | 0:ea44dc9ed014 | 435 | __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 436 | __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16)); |
joeverbout | 0:ea44dc9ed014 | 437 | __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 438 | __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16)); |
joeverbout | 0:ea44dc9ed014 | 439 | __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 440 | __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16)); |
joeverbout | 0:ea44dc9ed014 | 441 | |
joeverbout | 0:ea44dc9ed014 | 442 | v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 443 | v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16)); |
joeverbout | 0:ea44dc9ed014 | 444 | v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 445 | v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16)); |
joeverbout | 0:ea44dc9ed014 | 446 | v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 447 | v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16)); |
joeverbout | 0:ea44dc9ed014 | 448 | } |
joeverbout | 0:ea44dc9ed014 | 449 | |
joeverbout | 0:ea44dc9ed014 | 450 | inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, |
joeverbout | 0:ea44dc9ed014 | 451 | __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) |
joeverbout | 0:ea44dc9ed014 | 452 | { |
joeverbout | 0:ea44dc9ed014 | 453 | __m128i v_mask = _mm_set1_epi32(0x0000ffff); |
joeverbout | 0:ea44dc9ed014 | 454 | |
joeverbout | 0:ea44dc9ed014 | 455 | __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 456 | __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16)); |
joeverbout | 0:ea44dc9ed014 | 457 | __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 458 | __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16)); |
joeverbout | 0:ea44dc9ed014 | 459 | __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 460 | __m128i layer3_chunk6 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16)); |
joeverbout | 0:ea44dc9ed014 | 461 | __m128i layer3_chunk3 = _mm_packus_epi32(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 462 | __m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16)); |
joeverbout | 0:ea44dc9ed014 | 463 | |
joeverbout | 0:ea44dc9ed014 | 464 | __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 465 | __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16)); |
joeverbout | 0:ea44dc9ed014 | 466 | __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 467 | __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16)); |
joeverbout | 0:ea44dc9ed014 | 468 | __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 469 | __m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16)); |
joeverbout | 0:ea44dc9ed014 | 470 | __m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 471 | __m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16)); |
joeverbout | 0:ea44dc9ed014 | 472 | |
joeverbout | 0:ea44dc9ed014 | 473 | __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 474 | __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16)); |
joeverbout | 0:ea44dc9ed014 | 475 | __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 476 | __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16)); |
joeverbout | 0:ea44dc9ed014 | 477 | __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 478 | __m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16)); |
joeverbout | 0:ea44dc9ed014 | 479 | __m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 480 | __m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16)); |
joeverbout | 0:ea44dc9ed014 | 481 | |
joeverbout | 0:ea44dc9ed014 | 482 | v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 483 | v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16)); |
joeverbout | 0:ea44dc9ed014 | 484 | v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 485 | v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16)); |
joeverbout | 0:ea44dc9ed014 | 486 | v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 487 | v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16)); |
joeverbout | 0:ea44dc9ed014 | 488 | v_g1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask)); |
joeverbout | 0:ea44dc9ed014 | 489 | v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16)); |
joeverbout | 0:ea44dc9ed014 | 490 | } |
joeverbout | 0:ea44dc9ed014 | 491 | |
joeverbout | 0:ea44dc9ed014 | 492 | #endif // CV_SSE4_1 |
joeverbout | 0:ea44dc9ed014 | 493 | |
joeverbout | 0:ea44dc9ed014 | 494 | inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1) |
joeverbout | 0:ea44dc9ed014 | 495 | { |
joeverbout | 0:ea44dc9ed014 | 496 | __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0); |
joeverbout | 0:ea44dc9ed014 | 497 | __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0); |
joeverbout | 0:ea44dc9ed014 | 498 | __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1); |
joeverbout | 0:ea44dc9ed014 | 499 | __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1); |
joeverbout | 0:ea44dc9ed014 | 500 | |
joeverbout | 0:ea44dc9ed014 | 501 | __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2); |
joeverbout | 0:ea44dc9ed014 | 502 | __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2); |
joeverbout | 0:ea44dc9ed014 | 503 | __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3); |
joeverbout | 0:ea44dc9ed014 | 504 | __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk3); |
joeverbout | 0:ea44dc9ed014 | 505 | |
joeverbout | 0:ea44dc9ed014 | 506 | v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk2); |
joeverbout | 0:ea44dc9ed014 | 507 | v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk2); |
joeverbout | 0:ea44dc9ed014 | 508 | v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk3); |
joeverbout | 0:ea44dc9ed014 | 509 | v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk3); |
joeverbout | 0:ea44dc9ed014 | 510 | } |
joeverbout | 0:ea44dc9ed014 | 511 | |
joeverbout | 0:ea44dc9ed014 | 512 | inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, |
joeverbout | 0:ea44dc9ed014 | 513 | __m128 & v_g1, __m128 & v_b0, __m128 & v_b1) |
joeverbout | 0:ea44dc9ed014 | 514 | { |
joeverbout | 0:ea44dc9ed014 | 515 | __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1); |
joeverbout | 0:ea44dc9ed014 | 516 | __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1); |
joeverbout | 0:ea44dc9ed014 | 517 | __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0); |
joeverbout | 0:ea44dc9ed014 | 518 | __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0); |
joeverbout | 0:ea44dc9ed014 | 519 | __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1); |
joeverbout | 0:ea44dc9ed014 | 520 | __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1); |
joeverbout | 0:ea44dc9ed014 | 521 | |
joeverbout | 0:ea44dc9ed014 | 522 | __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3); |
joeverbout | 0:ea44dc9ed014 | 523 | __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3); |
joeverbout | 0:ea44dc9ed014 | 524 | __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4); |
joeverbout | 0:ea44dc9ed014 | 525 | __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4); |
joeverbout | 0:ea44dc9ed014 | 526 | __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5); |
joeverbout | 0:ea44dc9ed014 | 527 | __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5); |
joeverbout | 0:ea44dc9ed014 | 528 | |
joeverbout | 0:ea44dc9ed014 | 529 | v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3); |
joeverbout | 0:ea44dc9ed014 | 530 | v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3); |
joeverbout | 0:ea44dc9ed014 | 531 | v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4); |
joeverbout | 0:ea44dc9ed014 | 532 | v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4); |
joeverbout | 0:ea44dc9ed014 | 533 | v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5); |
joeverbout | 0:ea44dc9ed014 | 534 | v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5); |
joeverbout | 0:ea44dc9ed014 | 535 | } |
joeverbout | 0:ea44dc9ed014 | 536 | |
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                                __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b1);
    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_a0);
    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_a0);
    __m128 layer1_chunk6 = _mm_unpacklo_ps(v_g1, v_a1);
    __m128 layer1_chunk7 = _mm_unpackhi_ps(v_g1, v_a1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk4);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk4);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk5);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk5);
    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk6);
    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk6);
    __m128 layer2_chunk6 = _mm_unpacklo_ps(layer1_chunk3, layer1_chunk7);
    __m128 layer2_chunk7 = _mm_unpackhi_ps(layer1_chunk3, layer1_chunk7);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk4);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk4);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk5);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk5);
    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk6);
    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk6);
    v_a0 = _mm_unpacklo_ps(layer2_chunk3, layer2_chunk7);
    v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7);
}

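// Interleave two planar float channels: v_r0/v_r1 hold eight r values and
// v_g0/v_g1 eight g values.  After the call the registers, read in argument
// order, contain the packed stream r0 g0 r1 g1 ... r7 g7, ready to be stored
// back to contiguous memory.  _mm_shuffle_ps with the lo/hi masks plays the
// role of the unpack butterflies used by the deinterleave routines.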
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_g0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_g1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
}

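// Three-channel version of the same shuffle network: planar r, g and b
// (eight values each) are repacked into r g b r g b ... order, spread across
// the six registers read in argument order.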
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                              __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
    __m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
    v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
}

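// Four-channel version: planar r, g, b and a are repacked into r g b a ...
// order; after the call the eight registers, read in argument order, form the
// 32-float packed stream (the exact inverse of the four-channel
// _mm_deinterleave_ps above).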
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                              __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk5 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
    __m128 layer2_chunk6 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_a0, v_a1, mask_lo);
    __m128 layer2_chunk7 = _mm_shuffle_ps(v_a0, v_a1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
    __m128 layer1_chunk6 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_lo);
    __m128 layer1_chunk7 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_b0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_b1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
    v_a0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
    v_g1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_lo);
    v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
}

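// Usage sketch (illustrative only - the helper below is not part of the
// original header; its name and the assumed buffer layout are hypothetical).
// It shows how the register-level routines above are typically driven: load a
// run of packed RGBA floats in argument order, deinterleave, then store the
// resulting planar channels.
inline void example_split_rgba32f_8px(const float * packed, // 8 pixels = 32 floats
                                      float * r, float * g, float * b, float * a)
{
    // Load 32 consecutive floats; v_r0 holds elements 0-3, ..., v_a1 holds 28-31.
    __m128 v_r0 = _mm_loadu_ps(packed);
    __m128 v_r1 = _mm_loadu_ps(packed + 4);
    __m128 v_g0 = _mm_loadu_ps(packed + 8);
    __m128 v_g1 = _mm_loadu_ps(packed + 12);
    __m128 v_b0 = _mm_loadu_ps(packed + 16);
    __m128 v_b1 = _mm_loadu_ps(packed + 20);
    __m128 v_a0 = _mm_loadu_ps(packed + 24);
    __m128 v_a1 = _mm_loadu_ps(packed + 28);

    // Split the packed r g b a stream into planar channels in place.
    _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1);

    // Each destination buffer receives the eight values of one channel.
    _mm_storeu_ps(r, v_r0); _mm_storeu_ps(r + 4, v_r1);
    _mm_storeu_ps(g, v_g0); _mm_storeu_ps(g + 4, v_g1);
    _mm_storeu_ps(b, v_b0); _mm_storeu_ps(b + 4, v_b1);
    _mm_storeu_ps(a, v_a0); _mm_storeu_ps(a + 4, v_a1);
}
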
#endif // CV_SSE2

//! @}

#endif //__OPENCV_CORE_SSE_UTILS_HPP__