OpenCV library for Renesas RZ/A

Dependents:   RZ_A2M_Mbed_samples

Committer:
RyoheiHagimoto
Date:
Fri Jan 29 04:53:38 2021 +0000
Revision:
0:0e0631af0305
copied from https://github.com/d-kato/opencv-lib.

Who changed what in which revision?

User | Revision | Line number | New contents of line
RyoheiHagimoto 0:0e0631af0305 1 /*M///////////////////////////////////////////////////////////////////////////////////////
RyoheiHagimoto 0:0e0631af0305 2 //
RyoheiHagimoto 0:0e0631af0305 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
RyoheiHagimoto 0:0e0631af0305 4 //
RyoheiHagimoto 0:0e0631af0305 5 // By downloading, copying, installing or using the software you agree to this license.
RyoheiHagimoto 0:0e0631af0305 6 // If you do not agree to this license, do not download, install,
RyoheiHagimoto 0:0e0631af0305 7 // copy or use the software.
RyoheiHagimoto 0:0e0631af0305 8 //
RyoheiHagimoto 0:0e0631af0305 9 //
RyoheiHagimoto 0:0e0631af0305 10 // License Agreement
RyoheiHagimoto 0:0e0631af0305 11 // For Open Source Computer Vision Library
RyoheiHagimoto 0:0e0631af0305 12 //
RyoheiHagimoto 0:0e0631af0305 13 // Copyright (C) 2015, Itseez Inc., all rights reserved.
RyoheiHagimoto 0:0e0631af0305 14 // Third party copyrights are property of their respective owners.
RyoheiHagimoto 0:0e0631af0305 15 //
RyoheiHagimoto 0:0e0631af0305 16 // Redistribution and use in source and binary forms, with or without modification,
RyoheiHagimoto 0:0e0631af0305 17 // are permitted provided that the following conditions are met:
RyoheiHagimoto 0:0e0631af0305 18 //
RyoheiHagimoto 0:0e0631af0305 19 // * Redistribution's of source code must retain the above copyright notice,
RyoheiHagimoto 0:0e0631af0305 20 // this list of conditions and the following disclaimer.
RyoheiHagimoto 0:0e0631af0305 21 //
RyoheiHagimoto 0:0e0631af0305 22 // * Redistribution's in binary form must reproduce the above copyright notice,
RyoheiHagimoto 0:0e0631af0305 23 // this list of conditions and the following disclaimer in the documentation
RyoheiHagimoto 0:0e0631af0305 24 // and/or other materials provided with the distribution.
RyoheiHagimoto 0:0e0631af0305 25 //
RyoheiHagimoto 0:0e0631af0305 26 // * The name of the copyright holders may not be used to endorse or promote products
RyoheiHagimoto 0:0e0631af0305 27 // derived from this software without specific prior written permission.
RyoheiHagimoto 0:0e0631af0305 28 //
RyoheiHagimoto 0:0e0631af0305 29 // This software is provided by the copyright holders and contributors "as is" and
RyoheiHagimoto 0:0e0631af0305 30 // any express or implied warranties, including, but not limited to, the implied
RyoheiHagimoto 0:0e0631af0305 31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
RyoheiHagimoto 0:0e0631af0305 32 // In no event shall the Intel Corporation or contributors be liable for any direct,
RyoheiHagimoto 0:0e0631af0305 33 // indirect, incidental, special, exemplary, or consequential damages
RyoheiHagimoto 0:0e0631af0305 34 // (including, but not limited to, procurement of substitute goods or services;
RyoheiHagimoto 0:0e0631af0305 35 // loss of use, data, or profits; or business interruption) however caused
RyoheiHagimoto 0:0e0631af0305 36 // and on any theory of liability, whether in contract, strict liability,
RyoheiHagimoto 0:0e0631af0305 37 // or tort (including negligence or otherwise) arising in any way out of
RyoheiHagimoto 0:0e0631af0305 38 // the use of this software, even if advised of the possibility of such damage.
RyoheiHagimoto 0:0e0631af0305 39 //
RyoheiHagimoto 0:0e0631af0305 40 //M*/
RyoheiHagimoto 0:0e0631af0305 41
RyoheiHagimoto 0:0e0631af0305 42 #ifndef OPENCV_CORE_SSE_UTILS_HPP
RyoheiHagimoto 0:0e0631af0305 43 #define OPENCV_CORE_SSE_UTILS_HPP
RyoheiHagimoto 0:0e0631af0305 44
RyoheiHagimoto 0:0e0631af0305 45 #ifndef __cplusplus
RyoheiHagimoto 0:0e0631af0305 46 # error sse_utils.hpp header must be compiled as C++
RyoheiHagimoto 0:0e0631af0305 47 #endif
RyoheiHagimoto 0:0e0631af0305 48
RyoheiHagimoto 0:0e0631af0305 49 #include "opencv2/core/cvdef.h"
RyoheiHagimoto 0:0e0631af0305 50
RyoheiHagimoto 0:0e0631af0305 51 //! @addtogroup core_utils_sse
RyoheiHagimoto 0:0e0631af0305 52 //! @{
RyoheiHagimoto 0:0e0631af0305 53
RyoheiHagimoto 0:0e0631af0305 54 #if CV_SSE2
RyoheiHagimoto 0:0e0631af0305 55
//! Splits a 2-channel interleaved run of 64 bytes into two 32-byte planes, in place.
//!
//! On entry v_r0, v_r1, v_g0, v_g1 are read (in that order) as one 64-byte
//! sequence. On return the even-indexed bytes of that sequence are in
//! v_r0:v_r1 and the odd-indexed bytes are in v_g0:v_g1.
//!
//! The work is done by five identical shuffle rounds: each round pairs
//! register i of the lower half with register i of the upper half and emits
//! their low and high byte-wise unpacks as two consecutive registers.
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i cur[4] = { v_r0, v_r1, v_g0, v_g1 };

    // All loop bounds are compile-time constants.
    for (int round = 0; round < 5; ++round)
    {
        __m128i next[4];
        for (int i = 0; i < 2; ++i)
        {
            next[2 * i]     = _mm_unpacklo_epi8(cur[i], cur[i + 2]);
            next[2 * i + 1] = _mm_unpackhi_epi8(cur[i], cur[i + 2]);
        }
        for (int i = 0; i < 4; ++i)
            cur[i] = next[i];
    }

    v_r0 = cur[0];
    v_r1 = cur[1];
    v_g0 = cur[2];
    v_g1 = cur[3];
}
RyoheiHagimoto 0:0e0631af0305 83
//! Splits a 3-channel interleaved run of 96 bytes into three 32-byte planes, in place.
//!
//! On entry v_r0, v_r1, v_g0, v_g1, v_b0, v_b1 are read (in that order) as
//! one 96-byte sequence. On return the byte that was at position 3*j + c is
//! at position j of plane c, where plane 0 is v_r0:v_r1, plane 1 is
//! v_g0:v_g1 and plane 2 is v_b0:v_b1.
//!
//! Implemented as five identical shuffle rounds: each round pairs register i
//! of the first three with register i of the last three and emits their low
//! and high byte-wise unpacks as two consecutive registers.
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                  __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i cur[6] = { v_r0, v_r1, v_g0, v_g1, v_b0, v_b1 };

    // All loop bounds are compile-time constants.
    for (int round = 0; round < 5; ++round)
    {
        __m128i next[6];
        for (int i = 0; i < 3; ++i)
        {
            next[2 * i]     = _mm_unpacklo_epi8(cur[i], cur[i + 3]);
            next[2 * i + 1] = _mm_unpackhi_epi8(cur[i], cur[i + 3]);
        }
        for (int i = 0; i < 6; ++i)
            cur[i] = next[i];
    }

    v_r0 = cur[0];
    v_r1 = cur[1];
    v_g0 = cur[2];
    v_g1 = cur[3];
    v_b0 = cur[4];
    v_b1 = cur[5];
}
RyoheiHagimoto 0:0e0631af0305 122
//! Splits a 4-channel interleaved run of 128 bytes into four 32-byte planes, in place.
//!
//! On entry v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1 are read (in that
//! order) as one 128-byte sequence. On return the byte that was at position
//! 4*j + c is at position j of plane c, where the planes are v_r0:v_r1,
//! v_g0:v_g1, v_b0:v_b1 and v_a0:v_a1 for c = 0..3.
//!
//! Implemented as five identical shuffle rounds: each round pairs register i
//! of the first four with register i of the last four and emits their low
//! and high byte-wise unpacks as two consecutive registers.
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                  __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i cur[8] = { v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1 };

    // All loop bounds are compile-time constants.
    for (int round = 0; round < 5; ++round)
    {
        __m128i next[8];
        for (int i = 0; i < 4; ++i)
        {
            next[2 * i]     = _mm_unpacklo_epi8(cur[i], cur[i + 4]);
            next[2 * i + 1] = _mm_unpackhi_epi8(cur[i], cur[i + 4]);
        }
        for (int i = 0; i < 8; ++i)
            cur[i] = next[i];
    }

    v_r0 = cur[0];
    v_r1 = cur[1];
    v_g0 = cur[2];
    v_g1 = cur[3];
    v_b0 = cur[4];
    v_b1 = cur[5];
    v_a0 = cur[6];
    v_a1 = cur[7];
}
RyoheiHagimoto 0:0e0631af0305 171
//! Merges two 32-byte planes into a 2-channel interleaved run of 64 bytes, in place.
//!
//! On entry v_r0:v_r1 holds plane 0 and v_g0:v_g1 holds plane 1. On return
//! v_r0, v_r1, v_g0, v_g1 (in that order) hold the 64-byte sequence in which
//! byte 2*j + c is byte j of plane c.
//!
//! Implemented as five identical rounds: each round takes two consecutive
//! registers and packs their even-positioned bytes into one register of the
//! lower output half and their odd-positioned bytes into the matching
//! register of the upper output half.
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    // Keeps the low byte of every 16-bit lane.
    const __m128i low_byte = _mm_set1_epi16(0x00ff);
    __m128i cur[4] = { v_r0, v_r1, v_g0, v_g1 };

    // All loop bounds are compile-time constants.
    for (int round = 0; round < 5; ++round)
    {
        __m128i next[4];
        for (int i = 0; i < 2; ++i)
        {
            const __m128i first = cur[2 * i];
            const __m128i second = cur[2 * i + 1];
            next[i]     = _mm_packus_epi16(_mm_and_si128(first, low_byte), _mm_and_si128(second, low_byte));
            next[i + 2] = _mm_packus_epi16(_mm_srli_epi16(first, 8), _mm_srli_epi16(second, 8));
        }
        for (int i = 0; i < 4; ++i)
            cur[i] = next[i];
    }

    v_r0 = cur[0];
    v_r1 = cur[1];
    v_g0 = cur[2];
    v_g1 = cur[3];
}
RyoheiHagimoto 0:0e0631af0305 201
//! Merges three 32-byte planes into a 3-channel interleaved run of 96 bytes, in place.
//!
//! On entry v_r0:v_r1, v_g0:v_g1 and v_b0:v_b1 hold planes 0, 1 and 2. On
//! return v_r0, v_r1, v_g0, v_g1, v_b0, v_b1 (in that order) hold the 96-byte
//! sequence in which byte 3*j + c is byte j of plane c.
//!
//! Implemented as five identical rounds: each round takes two consecutive
//! registers and packs their even-positioned bytes into one register of the
//! first three outputs and their odd-positioned bytes into the matching
//! register of the last three outputs.
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    // Keeps the low byte of every 16-bit lane.
    const __m128i low_byte = _mm_set1_epi16(0x00ff);
    __m128i cur[6] = { v_r0, v_r1, v_g0, v_g1, v_b0, v_b1 };

    // All loop bounds are compile-time constants.
    for (int round = 0; round < 5; ++round)
    {
        __m128i next[6];
        for (int i = 0; i < 3; ++i)
        {
            const __m128i first = cur[2 * i];
            const __m128i second = cur[2 * i + 1];
            next[i]     = _mm_packus_epi16(_mm_and_si128(first, low_byte), _mm_and_si128(second, low_byte));
            next[i + 3] = _mm_packus_epi16(_mm_srli_epi16(first, 8), _mm_srli_epi16(second, 8));
        }
        for (int i = 0; i < 6; ++i)
            cur[i] = next[i];
    }

    v_r0 = cur[0];
    v_r1 = cur[1];
    v_g0 = cur[2];
    v_g1 = cur[3];
    v_b0 = cur[4];
    v_b1 = cur[5];
}
RyoheiHagimoto 0:0e0631af0305 242
//! Merges four 32-byte planes into a 4-channel interleaved run of 128 bytes, in place.
//!
//! On entry v_r0:v_r1, v_g0:v_g1, v_b0:v_b1 and v_a0:v_a1 hold planes 0..3.
//! On return v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1 (in that order)
//! hold the 128-byte sequence in which byte 4*j + c is byte j of plane c.
//!
//! Implemented as five identical rounds: each round takes two consecutive
//! registers and packs their even-positioned bytes into one register of the
//! first four outputs and their odd-positioned bytes into the matching
//! register of the last four outputs.
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    // Keeps the low byte of every 16-bit lane.
    const __m128i low_byte = _mm_set1_epi16(0x00ff);
    __m128i cur[8] = { v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1 };

    // All loop bounds are compile-time constants.
    for (int round = 0; round < 5; ++round)
    {
        __m128i next[8];
        for (int i = 0; i < 4; ++i)
        {
            const __m128i first = cur[2 * i];
            const __m128i second = cur[2 * i + 1];
            next[i]     = _mm_packus_epi16(_mm_and_si128(first, low_byte), _mm_and_si128(second, low_byte));
            next[i + 4] = _mm_packus_epi16(_mm_srli_epi16(first, 8), _mm_srli_epi16(second, 8));
        }
        for (int i = 0; i < 8; ++i)
            cur[i] = next[i];
    }

    v_r0 = cur[0];
    v_r1 = cur[1];
    v_g0 = cur[2];
    v_g1 = cur[3];
    v_b0 = cur[4];
    v_b1 = cur[5];
    v_a0 = cur[6];
    v_a1 = cur[7];
}
RyoheiHagimoto 0:0e0631af0305 293
//! Splits a 2-channel interleaved run of 32 16-bit words into two 16-word planes, in place.
//!
//! On entry v_r0, v_r1, v_g0, v_g1 are read (in that order) as one sequence
//! of 32 words. On return the even-indexed words are in v_r0:v_r1 and the
//! odd-indexed words are in v_g0:v_g1.
//!
//! Implemented as four identical shuffle rounds: each round pairs register i
//! of the lower half with register i of the upper half and emits their low
//! and high word-wise unpacks as two consecutive registers.
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i cur[4] = { v_r0, v_r1, v_g0, v_g1 };

    // All loop bounds are compile-time constants.
    for (int round = 0; round < 4; ++round)
    {
        __m128i next[4];
        for (int i = 0; i < 2; ++i)
        {
            next[2 * i]     = _mm_unpacklo_epi16(cur[i], cur[i + 2]);
            next[2 * i + 1] = _mm_unpackhi_epi16(cur[i], cur[i + 2]);
        }
        for (int i = 0; i < 4; ++i)
            cur[i] = next[i];
    }

    v_r0 = cur[0];
    v_r1 = cur[1];
    v_g0 = cur[2];
    v_g1 = cur[3];
}
RyoheiHagimoto 0:0e0631af0305 316
//! Splits a 3-channel interleaved run of 48 16-bit words into three 16-word planes, in place.
//!
//! On entry v_r0, v_r1, v_g0, v_g1, v_b0, v_b1 are read (in that order) as
//! one sequence of 48 words. On return the word that was at position 3*j + c
//! is at position j of plane c, where plane 0 is v_r0:v_r1, plane 1 is
//! v_g0:v_g1 and plane 2 is v_b0:v_b1.
//!
//! Implemented as four identical shuffle rounds: each round pairs register i
//! of the first three with register i of the last three and emits their low
//! and high word-wise unpacks as two consecutive registers.
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                   __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i cur[6] = { v_r0, v_r1, v_g0, v_g1, v_b0, v_b1 };

    // All loop bounds are compile-time constants.
    for (int round = 0; round < 4; ++round)
    {
        __m128i next[6];
        for (int i = 0; i < 3; ++i)
        {
            next[2 * i]     = _mm_unpacklo_epi16(cur[i], cur[i + 3]);
            next[2 * i + 1] = _mm_unpackhi_epi16(cur[i], cur[i + 3]);
        }
        for (int i = 0; i < 6; ++i)
            cur[i] = next[i];
    }

    v_r0 = cur[0];
    v_r1 = cur[1];
    v_g0 = cur[2];
    v_g1 = cur[3];
    v_b0 = cur[4];
    v_b1 = cur[5];
}
RyoheiHagimoto 0:0e0631af0305 348
// Permutes, in place, the 16-bit lanes of eight 128-bit registers using four
// rounds of _mm_unpacklo_epi16 / _mm_unpackhi_epi16.
// In every round, input k (k = 0..3) is paired with input k + 4; the low-half
// unpack becomes output 2k and the high-half unpack becomes output 2k + 1.
// All eight arguments are read in the first round and overwritten in the last.
// NOTE(review): worked out from the unpack pattern, and assuming the arguments
// hold 64 consecutive 16-bit values in parameter order, the result groups the
// values by index modulo 4 (r <- 0,4,8,...; g <- 1,5,9,...; b <- 2,6,10,...;
// a <- 3,7,11,...) -- confirm against callers.
RyoheiHagimoto 0:0e0631af0305 349 inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
RyoheiHagimoto 0:0e0631af0305 350 __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
RyoheiHagimoto 0:0e0631af0305 351 {
// Round 1: pairs taken straight from the arguments (r0|b0, r1|b1, g0|a0, g1|a1).
RyoheiHagimoto 0:0e0631af0305 352 __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0);
RyoheiHagimoto 0:0e0631af0305 353 __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0);
RyoheiHagimoto 0:0e0631af0305 354 __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b1);
RyoheiHagimoto 0:0e0631af0305 355 __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b1);
RyoheiHagimoto 0:0e0631af0305 356 __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_a0);
RyoheiHagimoto 0:0e0631af0305 357 __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0);
RyoheiHagimoto 0:0e0631af0305 358 __m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1);
RyoheiHagimoto 0:0e0631af0305 359 __m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1);
RyoheiHagimoto 0:0e0631af0305 360
// Round 2: same pairing (0|4, 1|5, 2|6, 3|7) applied to the round-1 results.
RyoheiHagimoto 0:0e0631af0305 361 __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4);
RyoheiHagimoto 0:0e0631af0305 362 __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4);
RyoheiHagimoto 0:0e0631af0305 363 __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5);
RyoheiHagimoto 0:0e0631af0305 364 __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk5);
RyoheiHagimoto 0:0e0631af0305 365 __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk6);
RyoheiHagimoto 0:0e0631af0305 366 __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk6);
RyoheiHagimoto 0:0e0631af0305 367 __m128i layer2_chunk6 = _mm_unpacklo_epi16(layer1_chunk3, layer1_chunk7);
RyoheiHagimoto 0:0e0631af0305 368 __m128i layer2_chunk7 = _mm_unpackhi_epi16(layer1_chunk3, layer1_chunk7);
RyoheiHagimoto 0:0e0631af0305 369
// Round 3: same pairing applied to the round-2 results.
RyoheiHagimoto 0:0e0631af0305 370 __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk4);
RyoheiHagimoto 0:0e0631af0305 371 __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk4);
RyoheiHagimoto 0:0e0631af0305 372 __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk5);
RyoheiHagimoto 0:0e0631af0305 373 __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk5);
RyoheiHagimoto 0:0e0631af0305 374 __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk6);
RyoheiHagimoto 0:0e0631af0305 375 __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk6);
RyoheiHagimoto 0:0e0631af0305 376 __m128i layer3_chunk6 = _mm_unpacklo_epi16(layer2_chunk3, layer2_chunk7);
RyoheiHagimoto 0:0e0631af0305 377 __m128i layer3_chunk7 = _mm_unpackhi_epi16(layer2_chunk3, layer2_chunk7);
RyoheiHagimoto 0:0e0631af0305 378
// Round 4: results are written back through the reference parameters.
RyoheiHagimoto 0:0e0631af0305 379 v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk4);
RyoheiHagimoto 0:0e0631af0305 380 v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk4);
RyoheiHagimoto 0:0e0631af0305 381 v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk5);
RyoheiHagimoto 0:0e0631af0305 382 v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk5);
RyoheiHagimoto 0:0e0631af0305 383 v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk6);
RyoheiHagimoto 0:0e0631af0305 384 v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk6);
RyoheiHagimoto 0:0e0631af0305 385 v_a0 = _mm_unpacklo_epi16(layer3_chunk3, layer3_chunk7);
RyoheiHagimoto 0:0e0631af0305 386 v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7);
RyoheiHagimoto 0:0e0631af0305 387 }
RyoheiHagimoto 0:0e0631af0305 388
RyoheiHagimoto 0:0e0631af0305 389 #if CV_SSE4_1
RyoheiHagimoto 0:0e0631af0305 390
// Permutes, in place, the 16-bit lanes of four 128-bit registers using four
// rounds of mask/shift + _mm_packus_epi32 (hence the CV_SSE4_1 guard).
// In every round, inputs 2k and 2k + 1 are combined twice: the AND-with-mask
// form packs their even-indexed 16-bit lanes into output k, and the
// shift-right-by-16 form packs their odd-indexed lanes into output k + 2.
// NOTE(review): this is the reverse of the pairing used by the 4-register
// _mm_deinterleave_epi16, so it presumably turns planar r/g data back into
// interleaved order -- confirm against callers.
RyoheiHagimoto 0:0e0631af0305 391 inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
RyoheiHagimoto 0:0e0631af0305 392 {
// Keeps the low 16 bits of every 32-bit lane. Both the masked and the shifted
// operands therefore lie in 0..0xffff, so the unsigned-saturating pack never clamps.
RyoheiHagimoto 0:0e0631af0305 393 __m128i v_mask = _mm_set1_epi32(0x0000ffff);
RyoheiHagimoto 0:0e0631af0305 394
// Round 1: operates directly on the arguments.
RyoheiHagimoto 0:0e0631af0305 395 __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
RyoheiHagimoto 0:0e0631af0305 396 __m128i layer3_chunk2 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
RyoheiHagimoto 0:0e0631af0305 397 __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
RyoheiHagimoto 0:0e0631af0305 398 __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
RyoheiHagimoto 0:0e0631af0305 399
// Round 2.
RyoheiHagimoto 0:0e0631af0305 400 __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
RyoheiHagimoto 0:0e0631af0305 401 __m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
RyoheiHagimoto 0:0e0631af0305 402 __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
RyoheiHagimoto 0:0e0631af0305 403 __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
RyoheiHagimoto 0:0e0631af0305 404
// Round 3.
RyoheiHagimoto 0:0e0631af0305 405 __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
RyoheiHagimoto 0:0e0631af0305 406 __m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
RyoheiHagimoto 0:0e0631af0305 407 __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
RyoheiHagimoto 0:0e0631af0305 408 __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
RyoheiHagimoto 0:0e0631af0305 409
// Round 4: outputs 0..3 land in v_r0, v_r1, v_g0, v_g1 respectively
// (the assignments below are listed in even/odd pair order, not output order).
RyoheiHagimoto 0:0e0631af0305 410 v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
RyoheiHagimoto 0:0e0631af0305 411 v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
RyoheiHagimoto 0:0e0631af0305 412 v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
RyoheiHagimoto 0:0e0631af0305 413 v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
RyoheiHagimoto 0:0e0631af0305 414 }
RyoheiHagimoto 0:0e0631af0305 415
// Permutes, in place, the 16-bit lanes of six 128-bit registers using four
// rounds of mask/shift + _mm_packus_epi32 (hence the CV_SSE4_1 guard).
// In every round, inputs 2k and 2k + 1 (k = 0..2) are combined twice: the
// AND-with-mask form packs their even-indexed 16-bit lanes into output k, and
// the shift-right-by-16 form packs their odd-indexed lanes into output k + 3.
// NOTE(review): this is the reverse of the pairing used by the 6-register
// _mm_deinterleave_epi16, so it presumably turns planar r/g/b data back into
// interleaved order -- confirm against callers.
RyoheiHagimoto 0:0e0631af0305 416 inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
RyoheiHagimoto 0:0e0631af0305 417 __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
RyoheiHagimoto 0:0e0631af0305 418 {
// Keeps the low 16 bits of every 32-bit lane. Both the masked and the shifted
// operands therefore lie in 0..0xffff, so the unsigned-saturating pack never clamps.
RyoheiHagimoto 0:0e0631af0305 419 __m128i v_mask = _mm_set1_epi32(0x0000ffff);
RyoheiHagimoto 0:0e0631af0305 420
// Round 1: operates directly on the arguments.
RyoheiHagimoto 0:0e0631af0305 421 __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
RyoheiHagimoto 0:0e0631af0305 422 __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
RyoheiHagimoto 0:0e0631af0305 423 __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
RyoheiHagimoto 0:0e0631af0305 424 __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
RyoheiHagimoto 0:0e0631af0305 425 __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
RyoheiHagimoto 0:0e0631af0305 426 __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
RyoheiHagimoto 0:0e0631af0305 427
// Round 2.
RyoheiHagimoto 0:0e0631af0305 428 __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
RyoheiHagimoto 0:0e0631af0305 429 __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
RyoheiHagimoto 0:0e0631af0305 430 __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
RyoheiHagimoto 0:0e0631af0305 431 __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
RyoheiHagimoto 0:0e0631af0305 432 __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
RyoheiHagimoto 0:0e0631af0305 433 __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
RyoheiHagimoto 0:0e0631af0305 434
// Round 3.
RyoheiHagimoto 0:0e0631af0305 435 __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
RyoheiHagimoto 0:0e0631af0305 436 __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
RyoheiHagimoto 0:0e0631af0305 437 __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
RyoheiHagimoto 0:0e0631af0305 438 __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
RyoheiHagimoto 0:0e0631af0305 439 __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
RyoheiHagimoto 0:0e0631af0305 440 __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
RyoheiHagimoto 0:0e0631af0305 441
// Round 4: outputs 0..5 land in v_r0, v_r1, v_g0, v_g1, v_b0, v_b1 respectively
// (the assignments below are listed in even/odd pair order, not output order).
RyoheiHagimoto 0:0e0631af0305 442 v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
RyoheiHagimoto 0:0e0631af0305 443 v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
RyoheiHagimoto 0:0e0631af0305 444 v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
RyoheiHagimoto 0:0e0631af0305 445 v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
RyoheiHagimoto 0:0e0631af0305 446 v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
RyoheiHagimoto 0:0e0631af0305 447 v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
RyoheiHagimoto 0:0e0631af0305 448 }
RyoheiHagimoto 0:0e0631af0305 449
// Permutes, in place, the 16-bit lanes of eight 128-bit registers using four
// rounds of mask/shift + _mm_packus_epi32 (hence the CV_SSE4_1 guard).
// In every round, inputs 2k and 2k + 1 (k = 0..3) are combined twice: the
// AND-with-mask form packs their even-indexed 16-bit lanes into output k, and
// the shift-right-by-16 form packs their odd-indexed lanes into output k + 4.
// NOTE(review): this is the reverse of the pairing used by the 8-register
// _mm_deinterleave_epi16, so it presumably turns planar r/g/b/a data back into
// interleaved order -- confirm against callers.
RyoheiHagimoto 0:0e0631af0305 450 inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
RyoheiHagimoto 0:0e0631af0305 451 __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
RyoheiHagimoto 0:0e0631af0305 452 {
// Keeps the low 16 bits of every 32-bit lane. Both the masked and the shifted
// operands therefore lie in 0..0xffff, so the unsigned-saturating pack never clamps.
RyoheiHagimoto 0:0e0631af0305 453 __m128i v_mask = _mm_set1_epi32(0x0000ffff);
RyoheiHagimoto 0:0e0631af0305 454
// Round 1: operates directly on the arguments.
RyoheiHagimoto 0:0e0631af0305 455 __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
RyoheiHagimoto 0:0e0631af0305 456 __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
RyoheiHagimoto 0:0e0631af0305 457 __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
RyoheiHagimoto 0:0e0631af0305 458 __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
RyoheiHagimoto 0:0e0631af0305 459 __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
RyoheiHagimoto 0:0e0631af0305 460 __m128i layer3_chunk6 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
RyoheiHagimoto 0:0e0631af0305 461 __m128i layer3_chunk3 = _mm_packus_epi32(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
RyoheiHagimoto 0:0e0631af0305 462 __m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16));
RyoheiHagimoto 0:0e0631af0305 463
// Round 2.
RyoheiHagimoto 0:0e0631af0305 464 __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
RyoheiHagimoto 0:0e0631af0305 465 __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
RyoheiHagimoto 0:0e0631af0305 466 __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
RyoheiHagimoto 0:0e0631af0305 467 __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
RyoheiHagimoto 0:0e0631af0305 468 __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
RyoheiHagimoto 0:0e0631af0305 469 __m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
RyoheiHagimoto 0:0e0631af0305 470 __m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
RyoheiHagimoto 0:0e0631af0305 471 __m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16));
RyoheiHagimoto 0:0e0631af0305 472
// Round 3.
RyoheiHagimoto 0:0e0631af0305 473 __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
RyoheiHagimoto 0:0e0631af0305 474 __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
RyoheiHagimoto 0:0e0631af0305 475 __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
RyoheiHagimoto 0:0e0631af0305 476 __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
RyoheiHagimoto 0:0e0631af0305 477 __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
RyoheiHagimoto 0:0e0631af0305 478 __m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
RyoheiHagimoto 0:0e0631af0305 479 __m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
RyoheiHagimoto 0:0e0631af0305 480 __m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16));
RyoheiHagimoto 0:0e0631af0305 481
// Round 4: outputs 0..7 land in v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1
// respectively (the assignments below are listed in even/odd pair order).
RyoheiHagimoto 0:0e0631af0305 482 v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
RyoheiHagimoto 0:0e0631af0305 483 v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
RyoheiHagimoto 0:0e0631af0305 484 v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
RyoheiHagimoto 0:0e0631af0305 485 v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
RyoheiHagimoto 0:0e0631af0305 486 v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
RyoheiHagimoto 0:0e0631af0305 487 v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
RyoheiHagimoto 0:0e0631af0305 488 v_g1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
RyoheiHagimoto 0:0e0631af0305 489 v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
RyoheiHagimoto 0:0e0631af0305 490 }
RyoheiHagimoto 0:0e0631af0305 491
RyoheiHagimoto 0:0e0631af0305 492 #endif // CV_SSE4_1
RyoheiHagimoto 0:0e0631af0305 493
// Permutes, in place, the float lanes of four __m128 registers using three
// rounds of _mm_unpacklo_ps / _mm_unpackhi_ps.
// In every round, input k (k = 0..1) is paired with input k + 2; the low-half
// unpack becomes output 2k and the high-half unpack becomes output 2k + 1.
// NOTE(review): worked out from the unpack pattern, and assuming the arguments
// hold 16 consecutive floats in parameter order, the result puts the
// even-indexed floats in v_r0/v_r1 and the odd-indexed floats in v_g0/v_g1
// -- confirm against callers.
RyoheiHagimoto 0:0e0631af0305 494 inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
RyoheiHagimoto 0:0e0631af0305 495 {
// Round 1: pairs taken straight from the arguments (r0|g0, r1|g1).
RyoheiHagimoto 0:0e0631af0305 496 __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0);
RyoheiHagimoto 0:0e0631af0305 497 __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0);
RyoheiHagimoto 0:0e0631af0305 498 __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1);
RyoheiHagimoto 0:0e0631af0305 499 __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1);
RyoheiHagimoto 0:0e0631af0305 500
// Round 2: same pairing (0|2, 1|3) applied to the round-1 results.
RyoheiHagimoto 0:0e0631af0305 501 __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2);
RyoheiHagimoto 0:0e0631af0305 502 __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2);
RyoheiHagimoto 0:0e0631af0305 503 __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3);
RyoheiHagimoto 0:0e0631af0305 504 __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk3);
RyoheiHagimoto 0:0e0631af0305 505
// Round 3: results are written back through the reference parameters.
RyoheiHagimoto 0:0e0631af0305 506 v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk2);
RyoheiHagimoto 0:0e0631af0305 507 v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk2);
RyoheiHagimoto 0:0e0631af0305 508 v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk3);
RyoheiHagimoto 0:0e0631af0305 509 v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk3);
RyoheiHagimoto 0:0e0631af0305 510 }
RyoheiHagimoto 0:0e0631af0305 511
// Permutes, in place, the float lanes of six __m128 registers using three
// rounds of _mm_unpacklo_ps / _mm_unpackhi_ps.
// In every round, input k (k = 0..2) is paired with input k + 3; the low-half
// unpack becomes output 2k and the high-half unpack becomes output 2k + 1.
// NOTE(review): worked out from the unpack pattern, and assuming the arguments
// hold 24 consecutive floats in parameter order, the result groups the floats
// by index modulo 3 (v_r0/v_r1 <- 0,3,6,...; v_g0/v_g1 <- 1,4,7,...;
// v_b0/v_b1 <- 2,5,8,...) -- confirm against callers.
RyoheiHagimoto 0:0e0631af0305 512 inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
RyoheiHagimoto 0:0e0631af0305 513 __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
RyoheiHagimoto 0:0e0631af0305 514 {
// Round 1: pairs taken straight from the arguments (r0|g1, r1|b0, g0|b1).
RyoheiHagimoto 0:0e0631af0305 515 __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1);
RyoheiHagimoto 0:0e0631af0305 516 __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1);
RyoheiHagimoto 0:0e0631af0305 517 __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0);
RyoheiHagimoto 0:0e0631af0305 518 __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0);
RyoheiHagimoto 0:0e0631af0305 519 __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1);
RyoheiHagimoto 0:0e0631af0305 520 __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1);
RyoheiHagimoto 0:0e0631af0305 521
// Round 2: same pairing (0|3, 1|4, 2|5) applied to the round-1 results.
RyoheiHagimoto 0:0e0631af0305 522 __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3);
RyoheiHagimoto 0:0e0631af0305 523 __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3);
RyoheiHagimoto 0:0e0631af0305 524 __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4);
RyoheiHagimoto 0:0e0631af0305 525 __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4);
RyoheiHagimoto 0:0e0631af0305 526 __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5);
RyoheiHagimoto 0:0e0631af0305 527 __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5);
RyoheiHagimoto 0:0e0631af0305 528
// Round 3: results are written back through the reference parameters.
RyoheiHagimoto 0:0e0631af0305 529 v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3);
RyoheiHagimoto 0:0e0631af0305 530 v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3);
RyoheiHagimoto 0:0e0631af0305 531 v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4);
RyoheiHagimoto 0:0e0631af0305 532 v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4);
RyoheiHagimoto 0:0e0631af0305 533 v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5);
RyoheiHagimoto 0:0e0631af0305 534 v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5);
RyoheiHagimoto 0:0e0631af0305 535 }
RyoheiHagimoto 0:0e0631af0305 536
// Permutes, in place, the float lanes of eight __m128 registers using three
// rounds of _mm_unpacklo_ps / _mm_unpackhi_ps.
// In every round, input k (k = 0..3) is paired with input k + 4; the low-half
// unpack becomes output 2k and the high-half unpack becomes output 2k + 1.
// NOTE(review): worked out from the unpack pattern, and assuming the arguments
// hold 32 consecutive floats in parameter order, the result groups the floats
// by index modulo 4 (r <- 0,4,8,...; g <- 1,5,9,...; b <- 2,6,10,...;
// a <- 3,7,11,...) -- confirm against callers.
RyoheiHagimoto 0:0e0631af0305 537 inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
RyoheiHagimoto 0:0e0631af0305 538 __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
RyoheiHagimoto 0:0e0631af0305 539 {
// Round 1: pairs taken straight from the arguments (r0|b0, r1|b1, g0|a0, g1|a1).
RyoheiHagimoto 0:0e0631af0305 540 __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0);
RyoheiHagimoto 0:0e0631af0305 541 __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0);
RyoheiHagimoto 0:0e0631af0305 542 __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1);
RyoheiHagimoto 0:0e0631af0305 543 __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b1);
RyoheiHagimoto 0:0e0631af0305 544 __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_a0);
RyoheiHagimoto 0:0e0631af0305 545 __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_a0);
RyoheiHagimoto 0:0e0631af0305 546 __m128 layer1_chunk6 = _mm_unpacklo_ps(v_g1, v_a1);
RyoheiHagimoto 0:0e0631af0305 547 __m128 layer1_chunk7 = _mm_unpackhi_ps(v_g1, v_a1);
RyoheiHagimoto 0:0e0631af0305 548
// Round 2: same pairing (0|4, 1|5, 2|6, 3|7) applied to the round-1 results.
RyoheiHagimoto 0:0e0631af0305 549 __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk4);
RyoheiHagimoto 0:0e0631af0305 550 __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk4);
RyoheiHagimoto 0:0e0631af0305 551 __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk5);
RyoheiHagimoto 0:0e0631af0305 552 __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk5);
RyoheiHagimoto 0:0e0631af0305 553 __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk6);
RyoheiHagimoto 0:0e0631af0305 554 __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk6);
RyoheiHagimoto 0:0e0631af0305 555 __m128 layer2_chunk6 = _mm_unpacklo_ps(layer1_chunk3, layer1_chunk7);
RyoheiHagimoto 0:0e0631af0305 556 __m128 layer2_chunk7 = _mm_unpackhi_ps(layer1_chunk3, layer1_chunk7);
RyoheiHagimoto 0:0e0631af0305 557
// Round 3: results are written back through the reference parameters.
RyoheiHagimoto 0:0e0631af0305 558 v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk4);
RyoheiHagimoto 0:0e0631af0305 559 v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk4);
RyoheiHagimoto 0:0e0631af0305 560 v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk5);
RyoheiHagimoto 0:0e0631af0305 561 v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk5);
RyoheiHagimoto 0:0e0631af0305 562 v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk6);
RyoheiHagimoto 0:0e0631af0305 563 v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk6);
RyoheiHagimoto 0:0e0631af0305 564 v_a0 = _mm_unpacklo_ps(layer2_chunk3, layer2_chunk7);
RyoheiHagimoto 0:0e0631af0305 565 v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7);
RyoheiHagimoto 0:0e0631af0305 566 }
RyoheiHagimoto 0:0e0631af0305 567
// Permutes, in place, the float lanes of four __m128 registers using three
// rounds of _mm_shuffle_ps.
// In every round, inputs 2k and 2k + 1 (k = 0..1) are combined twice: mask_lo
// gathers their even-indexed lanes into output k, and mask_hi gathers their
// odd-indexed lanes into output k + 2.
// NOTE(review): this is the reverse of the pairing used by the 4-register
// _mm_deinterleave_ps, so it presumably turns planar r/g data back into
// interleaved order -- confirm against callers.
RyoheiHagimoto 0:0e0631af0305 568 inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
RyoheiHagimoto 0:0e0631af0305 569 {
// mask_lo selects lanes 0 and 2 of each operand; mask_hi selects lanes 1 and 3.
RyoheiHagimoto 0:0e0631af0305 570 const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
RyoheiHagimoto 0:0e0631af0305 571
// Round 1: operates directly on the arguments.
RyoheiHagimoto 0:0e0631af0305 572 __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
RyoheiHagimoto 0:0e0631af0305 573 __m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
RyoheiHagimoto 0:0e0631af0305 574 __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
RyoheiHagimoto 0:0e0631af0305 575 __m128 layer2_chunk3 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
RyoheiHagimoto 0:0e0631af0305 576
// Round 2.
RyoheiHagimoto 0:0e0631af0305 577 __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
RyoheiHagimoto 0:0e0631af0305 578 __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
RyoheiHagimoto 0:0e0631af0305 579 __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
RyoheiHagimoto 0:0e0631af0305 580 __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
RyoheiHagimoto 0:0e0631af0305 581
// Round 3: outputs 0..3 land in v_r0, v_r1, v_g0, v_g1 respectively
// (the assignments below are listed in even/odd pair order, not output order).
RyoheiHagimoto 0:0e0631af0305 582 v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
RyoheiHagimoto 0:0e0631af0305 583 v_g0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
RyoheiHagimoto 0:0e0631af0305 584 v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
RyoheiHagimoto 0:0e0631af0305 585 v_g1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
RyoheiHagimoto 0:0e0631af0305 586 }
RyoheiHagimoto 0:0e0631af0305 587
// Permutes, in place, the float lanes of six __m128 registers using three
// rounds of _mm_shuffle_ps.
// In every round, inputs 2k and 2k + 1 (k = 0..2) are combined twice: mask_lo
// gathers their even-indexed lanes into output k, and mask_hi gathers their
// odd-indexed lanes into output k + 3.
// NOTE(review): this is the reverse of the pairing used by the 6-register
// _mm_deinterleave_ps, so it presumably turns planar r/g/b data back into
// interleaved order -- confirm against callers.
RyoheiHagimoto 0:0e0631af0305 588 inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
RyoheiHagimoto 0:0e0631af0305 589 __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
RyoheiHagimoto 0:0e0631af0305 590 {
// mask_lo selects lanes 0 and 2 of each operand; mask_hi selects lanes 1 and 3.
RyoheiHagimoto 0:0e0631af0305 591 const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
RyoheiHagimoto 0:0e0631af0305 592
// Round 1: operates directly on the arguments.
RyoheiHagimoto 0:0e0631af0305 593 __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
RyoheiHagimoto 0:0e0631af0305 594 __m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
RyoheiHagimoto 0:0e0631af0305 595 __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
RyoheiHagimoto 0:0e0631af0305 596 __m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
RyoheiHagimoto 0:0e0631af0305 597 __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
RyoheiHagimoto 0:0e0631af0305 598 __m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
RyoheiHagimoto 0:0e0631af0305 599
// Round 2.
RyoheiHagimoto 0:0e0631af0305 600 __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
RyoheiHagimoto 0:0e0631af0305 601 __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
RyoheiHagimoto 0:0e0631af0305 602 __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
RyoheiHagimoto 0:0e0631af0305 603 __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
RyoheiHagimoto 0:0e0631af0305 604 __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
RyoheiHagimoto 0:0e0631af0305 605 __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
RyoheiHagimoto 0:0e0631af0305 606
// Round 3: outputs 0..5 land in v_r0, v_r1, v_g0, v_g1, v_b0, v_b1 respectively
// (the assignments below are listed in even/odd pair order, not output order).
RyoheiHagimoto 0:0e0631af0305 607 v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
RyoheiHagimoto 0:0e0631af0305 608 v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
RyoheiHagimoto 0:0e0631af0305 609 v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
RyoheiHagimoto 0:0e0631af0305 610 v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
RyoheiHagimoto 0:0e0631af0305 611 v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
RyoheiHagimoto 0:0e0631af0305 612 v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
RyoheiHagimoto 0:0e0631af0305 613 }
RyoheiHagimoto 0:0e0631af0305 614
// Permutes, in place, the float lanes of eight __m128 registers using three
// rounds of _mm_shuffle_ps.
// In every round, inputs 2k and 2k + 1 (k = 0..3) are combined twice: mask_lo
// gathers their even-indexed lanes into output k, and mask_hi gathers their
// odd-indexed lanes into output k + 4.
// NOTE(review): this is the reverse of the pairing used by the 8-register
// _mm_deinterleave_ps, so it presumably turns planar r/g/b/a data back into
// interleaved order -- confirm against callers.
RyoheiHagimoto 0:0e0631af0305 615 inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
RyoheiHagimoto 0:0e0631af0305 616 __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
RyoheiHagimoto 0:0e0631af0305 617 {
// mask_lo selects lanes 0 and 2 of each operand; mask_hi selects lanes 1 and 3.
RyoheiHagimoto 0:0e0631af0305 618 const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
RyoheiHagimoto 0:0e0631af0305 619
// Round 1: operates directly on the arguments.
RyoheiHagimoto 0:0e0631af0305 620 __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
RyoheiHagimoto 0:0e0631af0305 621 __m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
RyoheiHagimoto 0:0e0631af0305 622 __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
RyoheiHagimoto 0:0e0631af0305 623 __m128 layer2_chunk5 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
RyoheiHagimoto 0:0e0631af0305 624 __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
RyoheiHagimoto 0:0e0631af0305 625 __m128 layer2_chunk6 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
RyoheiHagimoto 0:0e0631af0305 626 __m128 layer2_chunk3 = _mm_shuffle_ps(v_a0, v_a1, mask_lo);
RyoheiHagimoto 0:0e0631af0305 627 __m128 layer2_chunk7 = _mm_shuffle_ps(v_a0, v_a1, mask_hi);
RyoheiHagimoto 0:0e0631af0305 628
// Round 2.
RyoheiHagimoto 0:0e0631af0305 629 __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
RyoheiHagimoto 0:0e0631af0305 630 __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
RyoheiHagimoto 0:0e0631af0305 631 __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
RyoheiHagimoto 0:0e0631af0305 632 __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
RyoheiHagimoto 0:0e0631af0305 633 __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
RyoheiHagimoto 0:0e0631af0305 634 __m128 layer1_chunk6 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
RyoheiHagimoto 0:0e0631af0305 635 __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_lo);
RyoheiHagimoto 0:0e0631af0305 636 __m128 layer1_chunk7 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_hi);
RyoheiHagimoto 0:0e0631af0305 637
// Round 3: outputs 0..7 land in v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1
// respectively (the assignments below are listed in even/odd pair order).
RyoheiHagimoto 0:0e0631af0305 638 v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
RyoheiHagimoto 0:0e0631af0305 639 v_b0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
RyoheiHagimoto 0:0e0631af0305 640 v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
RyoheiHagimoto 0:0e0631af0305 641 v_b1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
RyoheiHagimoto 0:0e0631af0305 642 v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
RyoheiHagimoto 0:0e0631af0305 643 v_a0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
RyoheiHagimoto 0:0e0631af0305 644 v_g1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_lo);
RyoheiHagimoto 0:0e0631af0305 645 v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
RyoheiHagimoto 0:0e0631af0305 646 }
RyoheiHagimoto 0:0e0631af0305 647
RyoheiHagimoto 0:0e0631af0305 648 #endif // CV_SSE2
RyoheiHagimoto 0:0e0631af0305 649
RyoheiHagimoto 0:0e0631af0305 650 //! @}
RyoheiHagimoto 0:0e0631af0305 651
RyoheiHagimoto 0:0e0631af0305 652 #endif //OPENCV_CORE_SSE_UTILS_HPP