OpenCV library for Renesas RZ/A
Dependents: RZ_A2M_Mbed_samples
include/opencv2/core/sse_utils.hpp@0:0e0631af0305, 2021-01-29 (annotated)
- Committer:
- RyoheiHagimoto
- Date:
- Fri Jan 29 04:53:38 2021 +0000
- Revision:
- 0:0e0631af0305
copied from https://github.com/d-kato/opencv-lib.
Who changed what in which revision?
| User | Revision | Line number | New contents of line |
|---|---|---|---|
| RyoheiHagimoto | 0:0e0631af0305 | 1 | /*M/////////////////////////////////////////////////////////////////////////////////////// |
| RyoheiHagimoto | 0:0e0631af0305 | 2 | // |
| RyoheiHagimoto | 0:0e0631af0305 | 3 | // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
| RyoheiHagimoto | 0:0e0631af0305 | 4 | // |
| RyoheiHagimoto | 0:0e0631af0305 | 5 | // By downloading, copying, installing or using the software you agree to this license. |
| RyoheiHagimoto | 0:0e0631af0305 | 6 | // If you do not agree to this license, do not download, install, |
| RyoheiHagimoto | 0:0e0631af0305 | 7 | // copy or use the software. |
| RyoheiHagimoto | 0:0e0631af0305 | 8 | // |
| RyoheiHagimoto | 0:0e0631af0305 | 9 | // |
| RyoheiHagimoto | 0:0e0631af0305 | 10 | // License Agreement |
| RyoheiHagimoto | 0:0e0631af0305 | 11 | // For Open Source Computer Vision Library |
| RyoheiHagimoto | 0:0e0631af0305 | 12 | // |
| RyoheiHagimoto | 0:0e0631af0305 | 13 | // Copyright (C) 2015, Itseez Inc., all rights reserved. |
| RyoheiHagimoto | 0:0e0631af0305 | 14 | // Third party copyrights are property of their respective owners. |
| RyoheiHagimoto | 0:0e0631af0305 | 15 | // |
| RyoheiHagimoto | 0:0e0631af0305 | 16 | // Redistribution and use in source and binary forms, with or without modification, |
| RyoheiHagimoto | 0:0e0631af0305 | 17 | // are permitted provided that the following conditions are met: |
| RyoheiHagimoto | 0:0e0631af0305 | 18 | // |
| RyoheiHagimoto | 0:0e0631af0305 | 19 | // * Redistribution's of source code must retain the above copyright notice, |
| RyoheiHagimoto | 0:0e0631af0305 | 20 | // this list of conditions and the following disclaimer. |
| RyoheiHagimoto | 0:0e0631af0305 | 21 | // |
| RyoheiHagimoto | 0:0e0631af0305 | 22 | // * Redistribution's in binary form must reproduce the above copyright notice, |
| RyoheiHagimoto | 0:0e0631af0305 | 23 | // this list of conditions and the following disclaimer in the documentation |
| RyoheiHagimoto | 0:0e0631af0305 | 24 | // and/or other materials provided with the distribution. |
| RyoheiHagimoto | 0:0e0631af0305 | 25 | // |
| RyoheiHagimoto | 0:0e0631af0305 | 26 | // * The name of the copyright holders may not be used to endorse or promote products |
| RyoheiHagimoto | 0:0e0631af0305 | 27 | // derived from this software without specific prior written permission. |
| RyoheiHagimoto | 0:0e0631af0305 | 28 | // |
| RyoheiHagimoto | 0:0e0631af0305 | 29 | // This software is provided by the copyright holders and contributors "as is" and |
| RyoheiHagimoto | 0:0e0631af0305 | 30 | // any express or implied warranties, including, but not limited to, the implied |
| RyoheiHagimoto | 0:0e0631af0305 | 31 | // warranties of merchantability and fitness for a particular purpose are disclaimed. |
| RyoheiHagimoto | 0:0e0631af0305 | 32 | // In no event shall the Intel Corporation or contributors be liable for any direct, |
| RyoheiHagimoto | 0:0e0631af0305 | 33 | // indirect, incidental, special, exemplary, or consequential damages |
| RyoheiHagimoto | 0:0e0631af0305 | 34 | // (including, but not limited to, procurement of substitute goods or services; |
| RyoheiHagimoto | 0:0e0631af0305 | 35 | // loss of use, data, or profits; or business interruption) however caused |
| RyoheiHagimoto | 0:0e0631af0305 | 36 | // and on any theory of liability, whether in contract, strict liability, |
| RyoheiHagimoto | 0:0e0631af0305 | 37 | // or tort (including negligence or otherwise) arising in any way out of |
| RyoheiHagimoto | 0:0e0631af0305 | 38 | // the use of this software, even if advised of the possibility of such damage. |
| RyoheiHagimoto | 0:0e0631af0305 | 39 | // |
| RyoheiHagimoto | 0:0e0631af0305 | 40 | //M*/ |
| RyoheiHagimoto | 0:0e0631af0305 | 41 | |
| RyoheiHagimoto | 0:0e0631af0305 | 42 | #ifndef OPENCV_CORE_SSE_UTILS_HPP |
| RyoheiHagimoto | 0:0e0631af0305 | 43 | #define OPENCV_CORE_SSE_UTILS_HPP |
| RyoheiHagimoto | 0:0e0631af0305 | 44 | |
| RyoheiHagimoto | 0:0e0631af0305 | 45 | #ifndef __cplusplus |
| RyoheiHagimoto | 0:0e0631af0305 | 46 | # error sse_utils.hpp header must be compiled as C++ |
| RyoheiHagimoto | 0:0e0631af0305 | 47 | #endif |
| RyoheiHagimoto | 0:0e0631af0305 | 48 | |
| RyoheiHagimoto | 0:0e0631af0305 | 49 | #include "opencv2/core/cvdef.h" |
| RyoheiHagimoto | 0:0e0631af0305 | 50 | |
| RyoheiHagimoto | 0:0e0631af0305 | 51 | //! @addtogroup core_utils_sse |
| RyoheiHagimoto | 0:0e0631af0305 | 52 | //! @{ |
| RyoheiHagimoto | 0:0e0631af0305 | 53 | |
| RyoheiHagimoto | 0:0e0631af0305 | 54 | #if CV_SSE2 |
| RyoheiHagimoto | 0:0e0631af0305 | 55 | |
// Deinterleave two 8-bit channels: on entry the four registers hold byte
// pairs in interleaved order (presumably R,G,R,G,... given the names); on
// return v_r0:v_r1 hold the even-position bytes and v_g0:v_g1 the odd ones.
// Five rounds of unpacklo/unpackhi (each a perfect shuffle of the low and
// high halves of a register pair) realise the deinterleave permutation.
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    // Round 1 pairs each low register with its high partner.
    __m128i s0 = _mm_unpacklo_epi8(v_r0, v_g0);
    __m128i s1 = _mm_unpackhi_epi8(v_r0, v_g0);
    __m128i s2 = _mm_unpacklo_epi8(v_r1, v_g1);
    __m128i s3 = _mm_unpackhi_epi8(v_r1, v_g1);

    // Rounds 2..5 all apply the same (s0,s2)/(s1,s3) shuffle pattern.
    for (int round = 0; round < 4; ++round)
    {
        __m128i t0 = _mm_unpacklo_epi8(s0, s2);
        __m128i t1 = _mm_unpackhi_epi8(s0, s2);
        __m128i t2 = _mm_unpacklo_epi8(s1, s3);
        __m128i t3 = _mm_unpackhi_epi8(s1, s3);
        s0 = t0; s1 = t1; s2 = t2; s3 = t3;
    }

    v_r0 = s0;
    v_r1 = s1;
    v_g0 = s2;
    v_g1 = s3;
}
| RyoheiHagimoto | 0:0e0631af0305 | 83 | |
// Deinterleave three 8-bit channels spread across six registers (presumably
// R,G,B,R,G,B,... triplets given the names — the parameter names describe the
// OUTPUT layout: v_r0:v_r1 = first plane, v_g0:v_g1 = second, v_b0:v_b1 = third).
// Four intermediate unpack stages plus the final one implement the stride-3
// gather; note the stage-1 register pairing (v_r0 with v_g1, etc.) differs
// from the later stages, which all pair chunk k with chunk k+3.
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                  __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    // Stage 1: interleave registers that are three 16-byte chunks apart.
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g1);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g1);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b0);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b0);
    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_b1);
    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_b1);

    // Stages 2-4: repeat the perfect shuffle, always pairing chunk k with chunk k+3.
    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5);
    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5);
    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4);
    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5);
    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5);

    // Final stage writes the three separated planes back into the parameters.
    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4);
    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5);
    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5);
}
| RyoheiHagimoto | 0:0e0631af0305 | 122 | |
// Deinterleave four 8-bit channels spread across eight registers (presumably
// R,G,B,A,R,G,B,A,... quads given the names — the parameter names describe the
// OUTPUT layout: v_r0:v_r1, v_g0:v_g1, v_b0:v_b1, v_a0:v_a1 are the four planes).
// Stage 1 pairs registers four 16-byte chunks apart; every later stage pairs
// chunk k with chunk k+4. Five unpack stages total.
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                  __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    // Stage 1: interleave registers that are four chunks apart.
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b1);
    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_a0);
    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_a0);
    __m128i layer1_chunk6 = _mm_unpacklo_epi8(v_g1, v_a1);
    __m128i layer1_chunk7 = _mm_unpackhi_epi8(v_g1, v_a1);

    // Stages 2-4: repeat the perfect shuffle, pairing chunk k with chunk k+4.
    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk6 = _mm_unpacklo_epi8(layer1_chunk3, layer1_chunk7);
    __m128i layer2_chunk7 = _mm_unpackhi_epi8(layer1_chunk3, layer1_chunk7);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk6 = _mm_unpacklo_epi8(layer2_chunk3, layer2_chunk7);
    __m128i layer3_chunk7 = _mm_unpackhi_epi8(layer2_chunk3, layer2_chunk7);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk4);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk4);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk5);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk5);
    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk6);
    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk6);
    __m128i layer4_chunk6 = _mm_unpacklo_epi8(layer3_chunk3, layer3_chunk7);
    __m128i layer4_chunk7 = _mm_unpackhi_epi8(layer3_chunk3, layer3_chunk7);

    // Final stage writes the four separated planes back into the parameters.
    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk4);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk4);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk5);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk5);
    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk6);
    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk6);
    v_a0 = _mm_unpacklo_epi8(layer4_chunk3, layer4_chunk7);
    v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7);
}
| RyoheiHagimoto | 0:0e0631af0305 | 171 | |
// Interleave two 8-bit planes — the inverse of the two-register
// _mm_deinterleave_epi8. On entry v_r0:v_r1 and v_g0:v_g1 hold separate
// planes; on return the four registers hold the bytes in alternating order.
// Each round splits every register pair into its even 8-bit lanes
// (mask + saturating pack) and its odd lanes (shift + pack); both are
// lossless because the 16-bit intermediates never exceed 255.
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    const __m128i lo_byte = _mm_set1_epi16(0x00ff);

    __m128i s0 = v_r0, s1 = v_r1, s2 = v_g0, s3 = v_g1;

    // Five identical even/odd splitting rounds undo the deinterleave network.
    for (int round = 0; round < 5; ++round)
    {
        __m128i even01 = _mm_packus_epi16(_mm_and_si128(s0, lo_byte), _mm_and_si128(s1, lo_byte));
        __m128i odd01  = _mm_packus_epi16(_mm_srli_epi16(s0, 8), _mm_srli_epi16(s1, 8));
        __m128i even23 = _mm_packus_epi16(_mm_and_si128(s2, lo_byte), _mm_and_si128(s3, lo_byte));
        __m128i odd23  = _mm_packus_epi16(_mm_srli_epi16(s2, 8), _mm_srli_epi16(s3, 8));
        s0 = even01;
        s1 = even23;
        s2 = odd01;
        s3 = odd23;
    }

    v_r0 = s0;
    v_r1 = s1;
    v_g0 = s2;
    v_g1 = s3;
}
| RyoheiHagimoto | 0:0e0631af0305 | 201 | |
// Interleave three 8-bit planes — the inverse of the six-register
// _mm_deinterleave_epi8. v_r0:v_r1, v_g0:v_g1, v_b0:v_b1 hold separate planes
// on entry; on return the six registers hold the channels interleaved.
// Each stage splits a register pair into even 8-bit lanes (mask + pack) and
// odd lanes (shift + pack); _mm_packus_epi16 saturation is harmless because
// the masked/shifted 16-bit values are always <= 255. The chunk numbering
// mirrors the deinterleave network run backwards (layer4 down to layer1).
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);  // keeps the low byte of each 16-bit lane

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));

    // Final stage: note the scrambled output order (r0, g1, r1, b0, g0, b1),
    // which is exactly the stage-1 input pairing of the 3-plane deinterleave.
    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
}
| RyoheiHagimoto | 0:0e0631af0305 | 242 | |
// Interleave four 8-bit planes — the inverse of the eight-register
// _mm_deinterleave_epi8. v_r0:v_r1, v_g0:v_g1, v_b0:v_b1, v_a0:v_a1 hold
// separate planes on entry; on return the eight registers hold the channels
// interleaved. Each stage extracts even 8-bit lanes (mask + pack) and odd
// lanes (shift + pack); _mm_packus_epi16 saturation is harmless since masked
// or shifted 16-bit values never exceed 255.
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);  // keeps the low byte of each 16-bit lane

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer4_chunk6 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
    __m128i layer4_chunk7 = _mm_packus_epi16(_mm_srli_epi16(v_a0, 8), _mm_srli_epi16(v_a1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
    __m128i layer3_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_and_si128(layer4_chunk6, v_mask), _mm_and_si128(layer4_chunk7, v_mask));
    __m128i layer3_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk6, 8), _mm_srli_epi16(layer4_chunk7, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
    __m128i layer2_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk6, 8), _mm_srli_epi16(layer3_chunk7, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
    __m128i layer1_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk6, 8), _mm_srli_epi16(layer2_chunk7, 8));

    // Final stage: note the scrambled output order (r0, b0, r1, b1, g0, a0, g1, a1),
    // which is exactly the stage-1 input pairing of the 4-plane deinterleave.
    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_a0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
    v_g1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
    v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8));
}
| RyoheiHagimoto | 0:0e0631af0305 | 293 | |
// Deinterleave two 16-bit channels: the four registers hold interleaved
// 16-bit pairs on entry; on return v_r0:v_r1 hold the even-position words
// and v_g0:v_g1 the odd ones. Same network as the 8-bit two-plane variant,
// but 16-bit lanes halve the element count, so one fewer round is needed.
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    // Round 1 pairs each low register with its high partner.
    __m128i s0 = _mm_unpacklo_epi16(v_r0, v_g0);
    __m128i s1 = _mm_unpackhi_epi16(v_r0, v_g0);
    __m128i s2 = _mm_unpacklo_epi16(v_r1, v_g1);
    __m128i s3 = _mm_unpackhi_epi16(v_r1, v_g1);

    // Rounds 2..4 all apply the same (s0,s2)/(s1,s3) shuffle pattern.
    for (int round = 0; round < 3; ++round)
    {
        __m128i t0 = _mm_unpacklo_epi16(s0, s2);
        __m128i t1 = _mm_unpackhi_epi16(s0, s2);
        __m128i t2 = _mm_unpacklo_epi16(s1, s3);
        __m128i t3 = _mm_unpackhi_epi16(s1, s3);
        s0 = t0; s1 = t1; s2 = t2; s3 = t3;
    }

    v_r0 = s0;
    v_r1 = s1;
    v_g0 = s2;
    v_g1 = s3;
}
| RyoheiHagimoto | 0:0e0631af0305 | 316 | |
// Deinterleave three 16-bit channels: on entry the six registers hold 48
// consecutive samples of an interleaved r,g,b,r,g,b,... stream (in register
// order v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); on return each channel pair
// holds its plane. Three rounds of 16-bit unpacks; the irregular chunk
// numbering encodes the stride-3 permutation, so the exact operand pairing
// and statement order are load-bearing — do not reorder.
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                   __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    // Round 1: note the deliberately "crossed" pairings (r0 with g1, etc.)
    // required because 3 channels do not divide the register width evenly.
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b0);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b0);
    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_b1);
    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_b1);

    // Round 2: chunks are consumed in the rotated order (0,3), (1,4), (2,5).
    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5);
    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5);

    // Round 3: same rotated pairing as round 2.
    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5);
    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5);

    // Final round writes the planar channels back.
    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4);
    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5);
    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5);
}
| RyoheiHagimoto | 0:0e0631af0305 | 348 | |
// Deinterleave four 16-bit channels: on entry the eight registers hold 64
// consecutive samples of an r,g,b,a,r,g,b,a,... stream (in register order
// v_r0..v_a1); on return each channel pair holds its plane (r0..r15 in
// v_r0/v_r1, etc.). Three rounds of 16-bit unpacks; the crossed first-round
// pairings (r with b, g with a) implement the stride-4 separation, so
// operand pairing and statement order are exact — do not reorder.
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                   __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    // Round 1: pair channel 0 with channel 2, channel 1 with channel 3.
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b1);
    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_a0);
    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0);
    __m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1);
    __m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1);

    // Round 2: chunks pair at offset 4 (k with k+4).
    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk6 = _mm_unpacklo_epi16(layer1_chunk3, layer1_chunk7);
    __m128i layer2_chunk7 = _mm_unpackhi_epi16(layer1_chunk3, layer1_chunk7);

    // Round 3: same offset-4 pairing.
    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk6 = _mm_unpacklo_epi16(layer2_chunk3, layer2_chunk7);
    __m128i layer3_chunk7 = _mm_unpackhi_epi16(layer2_chunk3, layer2_chunk7);

    // Final round writes the planar channels back.
    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk4);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk4);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk5);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk5);
    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk6);
    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk6);
    v_a0 = _mm_unpacklo_epi16(layer3_chunk3, layer3_chunk7);
    v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7);
}
| RyoheiHagimoto | 0:0e0631af0305 | 388 | |
| RyoheiHagimoto | 0:0e0631af0305 | 389 | #if CV_SSE4_1 |
| RyoheiHagimoto | 0:0e0631af0305 | 390 | |
// Interleave two planar 16-bit channels back into r,g,r,g,... order — the
// inverse of the 4-register _mm_deinterleave_epi16 above. Each round views
// the register as 32-bit lanes: AND with v_mask keeps the even 16-bit
// sample, the >>16 keeps the odd one, and _mm_packus_epi32 (SSE4.1)
// narrows 32->16. Because every lane is pre-masked or pre-shifted into
// [0, 0xFFFF], the unsigned saturation of the pack never fires and the
// narrowing is lossless. Chunk numbering is the inverse permutation of the
// deinterleave — do not reorder statements.
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);  // selects the low 16 bits of each 32-bit lane

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));

    // Final round writes the interleaved stream back in register order
    // v_r0, v_r1, v_g0, v_g1.
    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
}
| RyoheiHagimoto | 0:0e0631af0305 | 415 | |
// Interleave three planar 16-bit channels back into r,g,b,r,g,b,... order —
// the inverse of the 6-register _mm_deinterleave_epi16 above. Same
// even/odd-lane split per round (AND = even sample, >>16 = odd sample,
// _mm_packus_epi32 narrows losslessly since lanes are within [0, 0xFFFF]).
// The scattered chunk numbering is the inverse of the deinterleave's
// stride-3 permutation; pairing and order are exact — do not reorder.
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                 __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);  // low 16 bits of each 32-bit lane

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));

    // Final round: note the outputs land in the scrambled register order
    // (r0, g1, r1, b0, g0, b1) that matches the deinterleave's input layout.
    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
}
| RyoheiHagimoto | 0:0e0631af0305 | 449 | |
// Interleave four planar 16-bit channels back into r,g,b,a,r,g,b,a,... order —
// the inverse of the 8-register _mm_deinterleave_epi16 above. Same
// even/odd-lane split per round (AND = even sample, >>16 = odd sample,
// _mm_packus_epi32 narrows losslessly since every lane is within
// [0, 0xFFFF]). Chunk numbering is the inverse stride-4 permutation;
// pairing and order are exact — do not reorder.
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                 __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);  // low 16 bits of each 32-bit lane

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer3_chunk6 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
    __m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
    __m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
    __m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16));

    // Final round: outputs land in the register order (r0, b0, r1, b1,
    // g0, a0, g1, a1) matching the deinterleave's crossed input layout.
    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
    v_g1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
    v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
}
| RyoheiHagimoto | 0:0e0631af0305 | 491 | |
| RyoheiHagimoto | 0:0e0631af0305 | 492 | #endif // CV_SSE4_1 |
| RyoheiHagimoto | 0:0e0631af0305 | 493 | |
// Deinterleave two float channels: on entry the four registers hold 16
// consecutive samples of an r,g,r,g,... stream (in register order
// v_r0, v_r1, v_g0, v_g1); on return v_r0/v_r1 hold r0..r7 and v_g0/v_g1
// hold g0..g7. Three rounds of single-precision unpacks separate the even
// (r) from the odd (g) stream positions.
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
    // Round 1: merge low/high halves of the interleaved sources.
    __m128 p0 = _mm_unpacklo_ps(v_r0, v_g0);
    __m128 p1 = _mm_unpackhi_ps(v_r0, v_g0);
    __m128 p2 = _mm_unpacklo_ps(v_r1, v_g1);
    __m128 p3 = _mm_unpackhi_ps(v_r1, v_g1);

    // Round 2: double the separation stride.
    __m128 q0 = _mm_unpacklo_ps(p0, p2);
    __m128 q1 = _mm_unpackhi_ps(p0, p2);
    __m128 q2 = _mm_unpacklo_ps(p1, p3);
    __m128 q3 = _mm_unpackhi_ps(p1, p3);

    // Round 3: write the fully planar channels back into the outputs.
    v_r0 = _mm_unpacklo_ps(q0, q2);
    v_r1 = _mm_unpackhi_ps(q0, q2);
    v_g0 = _mm_unpacklo_ps(q1, q3);
    v_g1 = _mm_unpackhi_ps(q1, q3);
}
| RyoheiHagimoto | 0:0e0631af0305 | 511 | |
// Deinterleave three float channels: on entry the six registers hold 24
// consecutive samples of an interleaved r,g,b,r,g,b,... stream (in register
// order v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); on return each channel pair
// holds its plane. Two rounds of single-precision unpacks; the crossed
// first-round pairings and rotated chunk numbering encode the stride-3
// permutation, so pairing and statement order are exact — do not reorder.
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                                __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
    // Round 1: deliberately crossed pairings (r0 with g1, r1 with b0, ...)
    // because 3 channels do not divide the register width evenly.
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0);
    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1);
    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1);

    // Round 2: chunks are consumed in the rotated order (0,3), (1,4), (2,5).
    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4);
    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5);
    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5);

    // Final round writes the planar channels back.
    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4);
    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5);
    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5);
}
| RyoheiHagimoto | 0:0e0631af0305 | 536 | |
// Deinterleave four float channels: on entry the eight registers hold 32
// consecutive samples of an r,g,b,a,r,g,b,a,... stream (in register order
// v_r0..v_a1); on return each channel pair holds its plane (r0..r7 in
// v_r0/v_r1, etc.). Two rounds of single-precision unpacks; round 1 pairs
// channel 0 with channel 2 and channel 1 with channel 3 to realize the
// stride-4 separation.
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                                __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
    // Round 1: crossed pairings (r with b, g with a).
    __m128 u0 = _mm_unpacklo_ps(v_r0, v_b0);
    __m128 u1 = _mm_unpackhi_ps(v_r0, v_b0);
    __m128 u2 = _mm_unpacklo_ps(v_r1, v_b1);
    __m128 u3 = _mm_unpackhi_ps(v_r1, v_b1);
    __m128 u4 = _mm_unpacklo_ps(v_g0, v_a0);
    __m128 u5 = _mm_unpackhi_ps(v_g0, v_a0);
    __m128 u6 = _mm_unpacklo_ps(v_g1, v_a1);
    __m128 u7 = _mm_unpackhi_ps(v_g1, v_a1);

    // Round 2: chunks pair at offset 4 (k with k+4).
    __m128 w0 = _mm_unpacklo_ps(u0, u4);
    __m128 w1 = _mm_unpackhi_ps(u0, u4);
    __m128 w2 = _mm_unpacklo_ps(u1, u5);
    __m128 w3 = _mm_unpackhi_ps(u1, u5);
    __m128 w4 = _mm_unpacklo_ps(u2, u6);
    __m128 w5 = _mm_unpackhi_ps(u2, u6);
    __m128 w6 = _mm_unpacklo_ps(u3, u7);
    __m128 w7 = _mm_unpackhi_ps(u3, u7);

    // Final round writes the planar channels back.
    v_r0 = _mm_unpacklo_ps(w0, w4);
    v_r1 = _mm_unpackhi_ps(w0, w4);
    v_g0 = _mm_unpacklo_ps(w1, w5);
    v_g1 = _mm_unpackhi_ps(w1, w5);
    v_b0 = _mm_unpacklo_ps(w2, w6);
    v_b1 = _mm_unpackhi_ps(w2, w6);
    v_a0 = _mm_unpacklo_ps(w3, w7);
    v_a1 = _mm_unpackhi_ps(w3, w7);
}
| RyoheiHagimoto | 0:0e0631af0305 | 567 | |
// Interleave two planar float channels back into r,g,r,g,... order — the
// inverse of the 4-register _mm_deinterleave_ps above. Each shuffle with
// sel_even gathers lanes 0 and 2 of both operands, sel_odd gathers lanes
// 1 and 3; three rounds rebuild the interleaved stream in register order
// v_r0, v_r1, v_g0, v_g1.
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
    const int sel_even = _MM_SHUFFLE(2, 0, 2, 0);
    const int sel_odd  = _MM_SHUFFLE(3, 1, 3, 1);

    // Round 1: split each channel into its even- and odd-indexed samples.
    __m128 q0 = _mm_shuffle_ps(v_r0, v_r1, sel_even);
    __m128 q2 = _mm_shuffle_ps(v_r0, v_r1, sel_odd);
    __m128 q1 = _mm_shuffle_ps(v_g0, v_g1, sel_even);
    __m128 q3 = _mm_shuffle_ps(v_g0, v_g1, sel_odd);

    // Round 2: halve the gather stride.
    __m128 p0 = _mm_shuffle_ps(q0, q1, sel_even);
    __m128 p2 = _mm_shuffle_ps(q0, q1, sel_odd);
    __m128 p1 = _mm_shuffle_ps(q2, q3, sel_even);
    __m128 p3 = _mm_shuffle_ps(q2, q3, sel_odd);

    // Round 3: emit the interleaved stream.
    v_r0 = _mm_shuffle_ps(p0, p1, sel_even);
    v_g0 = _mm_shuffle_ps(p0, p1, sel_odd);
    v_r1 = _mm_shuffle_ps(p2, p3, sel_even);
    v_g1 = _mm_shuffle_ps(p2, p3, sel_odd);
}
| RyoheiHagimoto | 0:0e0631af0305 | 587 | |
// Interleave three planar float channels back into r,g,b,r,g,b,... order —
// the inverse of the 6-register _mm_deinterleave_ps above. mask_lo gathers
// lanes 0 and 2 of both shuffle operands, mask_hi gathers lanes 1 and 3.
// The scattered chunk numbering is the inverse of the deinterleave's
// stride-3 permutation; pairing and statement order are exact — do not
// reorder.
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                              __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    // Round 1: split each channel into even/odd-indexed samples.
    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
    __m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);

    // Round 2: chunks are consumed in the rotated order (0,1), (2,3), (4,5)
    // but produced under the inverse stride-3 numbering.
    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);

    // Final round: outputs land in the scrambled register order
    // (r0, g1, r1, b0, g0, b1) matching the deinterleave's input layout.
    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
    v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
}
| RyoheiHagimoto | 0:0e0631af0305 | 614 | |
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                              __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
    // Convert four planar channels (r, g, b, a -- two registers, 8 floats
    // each) into channel-interleaved order across the same eight registers,
    // in place. Three shuffle stages; every stage takes either the even
    // lanes (2,0,2,0) or the odd lanes (3,1,3,1) of a register pair.
    const int even_lanes = _MM_SHUFFLE(2, 0, 2, 0);
    const int odd_lanes  = _MM_SHUFFLE(3, 1, 3, 1);

    // Stage 1: split each channel's register pair into even/odd halves.
    __m128 s2_0 = _mm_shuffle_ps(v_r0, v_r1, even_lanes);
    __m128 s2_1 = _mm_shuffle_ps(v_g0, v_g1, even_lanes);
    __m128 s2_2 = _mm_shuffle_ps(v_b0, v_b1, even_lanes);
    __m128 s2_3 = _mm_shuffle_ps(v_a0, v_a1, even_lanes);
    __m128 s2_4 = _mm_shuffle_ps(v_r0, v_r1, odd_lanes);
    __m128 s2_5 = _mm_shuffle_ps(v_g0, v_g1, odd_lanes);
    __m128 s2_6 = _mm_shuffle_ps(v_b0, v_b1, odd_lanes);
    __m128 s2_7 = _mm_shuffle_ps(v_a0, v_a1, odd_lanes);

    // Stage 2: repeat the even/odd split on the intermediate registers.
    __m128 s1_0 = _mm_shuffle_ps(s2_0, s2_1, even_lanes);
    __m128 s1_1 = _mm_shuffle_ps(s2_2, s2_3, even_lanes);
    __m128 s1_2 = _mm_shuffle_ps(s2_4, s2_5, even_lanes);
    __m128 s1_3 = _mm_shuffle_ps(s2_6, s2_7, even_lanes);
    __m128 s1_4 = _mm_shuffle_ps(s2_0, s2_1, odd_lanes);
    __m128 s1_5 = _mm_shuffle_ps(s2_2, s2_3, odd_lanes);
    __m128 s1_6 = _mm_shuffle_ps(s2_4, s2_5, odd_lanes);
    __m128 s1_7 = _mm_shuffle_ps(s2_6, s2_7, odd_lanes);

    // Stage 3: final split writes the interleaved result back to the
    // caller's registers (output order: r0, b0, r1, b1, g0, a0, g1, a1).
    v_r0 = _mm_shuffle_ps(s1_0, s1_1, even_lanes);
    v_b0 = _mm_shuffle_ps(s1_0, s1_1, odd_lanes);
    v_r1 = _mm_shuffle_ps(s1_2, s1_3, even_lanes);
    v_b1 = _mm_shuffle_ps(s1_2, s1_3, odd_lanes);
    v_g0 = _mm_shuffle_ps(s1_4, s1_5, even_lanes);
    v_a0 = _mm_shuffle_ps(s1_4, s1_5, odd_lanes);
    v_g1 = _mm_shuffle_ps(s1_6, s1_7, even_lanes);
    v_a1 = _mm_shuffle_ps(s1_6, s1_7, odd_lanes);
}
| RyoheiHagimoto | 0:0e0631af0305 | 647 | |
| RyoheiHagimoto | 0:0e0631af0305 | 648 | #endif // CV_SSE2 |
| RyoheiHagimoto | 0:0e0631af0305 | 649 | |
| RyoheiHagimoto | 0:0e0631af0305 | 650 | //! @} |
| RyoheiHagimoto | 0:0e0631af0305 | 651 | |
| RyoheiHagimoto | 0:0e0631af0305 | 652 | #endif //OPENCV_CORE_SSE_UTILS_HPP |