filters_sse.h
/* Copyright (C) 2002 Jean-Marc Valin */
/**
   @file filters_sse.h
   @brief Various analysis/synthesis filters (SSE version)
*/
/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   - Neither the name of the Xiph.org Foundation nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <xmmintrin.h>

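/* Layout note for the kernels below: for ord==10 the coefficients and the
   filter memory are packed into three __m128 registers (4+4+2 lanes, with the
   two unused upper lanes of the third register kept at zero); for ord==8 two
   registers (4+4) suffice.  Each per-sample "update memory" step shifts a
   delay line down by one lane (_mm_move_ss to pull in the next register's
   lowest lane, then _mm_shuffle_ps with 0x39 to rotate) before accumulating
   the new input/output contributions. */
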
void filter_mem16_10(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem)
{
   __m128 num[3], den[3], mem[3];

   int i;

   /* Copy numerator, denominator and filter memory into xmm registers */
   for (i=0;i<2;i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      num[i] = _mm_loadu_ps(_num+4*i);
      den[i] = _mm_loadu_ps(_den+4*i);
   }
   mem[2] = _mm_setr_ps(_mem[8], _mem[9], 0, 0);
   num[2] = _mm_setr_ps(_num[8], _num[9], 0, 0);
   den[2] = _mm_setr_ps(_den[8], _den[9], 0, 0);

   for (i=0;i<N;i++)
   {
      __m128 xx;
      __m128 yy;
      /* Compute next filter result: y[i] = x[i] + first memory element,
         then broadcast x[i] and y[i] to all lanes */
      xx = _mm_load_ps1(x+i);
      yy = _mm_add_ss(xx, mem[0]);
      _mm_store_ss(y+i, yy);
      yy = _mm_shuffle_ps(yy, yy, 0);

      /* Update memory: shift each delay line down one lane, then accumulate the taps */
      mem[0] = _mm_move_ss(mem[0], mem[1]);           /* pull mem[1]'s lowest lane into lane 0 */
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);  /* rotate so the line shifts down by one */

      mem[0] = _mm_add_ps(mem[0], _mm_mul_ps(xx, num[0]));
      mem[0] = _mm_sub_ps(mem[0], _mm_mul_ps(yy, den[0]));

      mem[1] = _mm_move_ss(mem[1], mem[2]);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_add_ps(mem[1], _mm_mul_ps(xx, num[1]));
      mem[1] = _mm_sub_ps(mem[1], _mm_mul_ps(yy, den[1]));

      mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0xfd);  /* shift the two live lanes; upper lanes stay zero */

      mem[2] = _mm_add_ps(mem[2], _mm_mul_ps(xx, num[2]));
      mem[2] = _mm_sub_ps(mem[2], _mm_mul_ps(yy, den[2]));
   }
   /* Put memory back in its place */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
   _mm_store_ss(_mem+8, mem[2]);
   mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0x55);     /* move lane 1 into lane 0 to store _mem[9] */
   _mm_store_ss(_mem+9, mem[2]);
}
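
/* For reference, a plain-C sketch of the recurrence the vectorized loop above
   computes (transposed direct form II).  filter_mem16_scalar is an
   illustrative name, not part of the Speex API; as the code above implies,
   _num and _den are assumed to hold the ord taps that follow the implicit
   leading unity coefficient, and _mem the ord state values. */
static inline void filter_mem16_scalar(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem)
{
   int i, j;
   for (i=0;i<N;i++)
   {
      float yi = x[i] + _mem[0];                            /* output = input + first state value */
      for (j=0;j<ord-1;j++)
         _mem[j] = _mem[j+1] + x[i]*_num[j] - yi*_den[j];   /* shift the delay line and accumulate */
      _mem[ord-1] = x[i]*_num[ord-1] - yi*_den[ord-1];      /* last tap has nothing to shift in */
      y[i] = yi;
   }
}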

void filter_mem16_8(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem)
{
   __m128 num[2], den[2], mem[2];

   int i;

   /* Copy numerator, denominator and filter memory into xmm registers */
   for (i=0;i<2;i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      num[i] = _mm_loadu_ps(_num+4*i);
      den[i] = _mm_loadu_ps(_den+4*i);
   }

   for (i=0;i<N;i++)
   {
      __m128 xx;
      __m128 yy;
      /* Compute next filter result */
      xx = _mm_load_ps1(x+i);
      yy = _mm_add_ss(xx, mem[0]);
      _mm_store_ss(y+i, yy);
      yy = _mm_shuffle_ps(yy, yy, 0);

      /* Update memory */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);

      mem[0] = _mm_add_ps(mem[0], _mm_mul_ps(xx, num[0]));
      mem[0] = _mm_sub_ps(mem[0], _mm_mul_ps(yy, den[0]));

      mem[1] = _mm_sub_ss(mem[1], mem[1]);            /* zero lane 0 so the rotate shifts a zero into the top of the line */
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_add_ps(mem[1], _mm_mul_ps(xx, num[1]));
      mem[1] = _mm_sub_ps(mem[1], _mm_mul_ps(yy, den[1]));
   }
   /* Put memory back in its place */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
}


#define OVERRIDE_FILTER_MEM16
void filter_mem16(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem, char *stack)
{
   if(ord==10)
      filter_mem16_10(x, _num, _den, y, N, ord, _mem);
   else if (ord==8)
      filter_mem16_8(x, _num, _den, y, N, ord, _mem);
}
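
/* Illustrative usage sketch: the frame length (160), the order-10 coefficient
   buffers and filter_frame_example itself are hypothetical, chosen only to
   show the calling convention.  The state in mem must persist between calls;
   the stack argument is not used by this SSE path. */
static void filter_frame_example(const float *frame, const float *num10, const float *den10, float *out)
{
   static float mem[10];   /* order-10 filter state, starts zeroed */
   filter_mem16(frame, num10, den10, out, 160, 10, mem, NULL);
}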



void iir_mem16_10(const float *x, const float *_den, float *y, int N, int ord, float *_mem)
{
   __m128 den[3], mem[3];

   int i;

   /* Copy denominator and filter memory into xmm registers */
   for (i=0;i<2;i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      den[i] = _mm_loadu_ps(_den+4*i);
   }
   mem[2] = _mm_setr_ps(_mem[8], _mem[9], 0, 0);
   den[2] = _mm_setr_ps(_den[8], _den[9], 0, 0);

   for (i=0;i<N;i++)
   {
      __m128 xx;
      __m128 yy;
      /* Compute next filter result */
      xx = _mm_load_ps1(x+i);
      yy = _mm_add_ss(xx, mem[0]);
      _mm_store_ss(y+i, yy);
      yy = _mm_shuffle_ps(yy, yy, 0);

      /* Update memory */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);

      mem[0] = _mm_sub_ps(mem[0], _mm_mul_ps(yy, den[0]));

      mem[1] = _mm_move_ss(mem[1], mem[2]);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_sub_ps(mem[1], _mm_mul_ps(yy, den[1]));

      mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0xfd);

      mem[2] = _mm_sub_ps(mem[2], _mm_mul_ps(yy, den[2]));
   }
   /* Put memory back in its place */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
   _mm_store_ss(_mem+8, mem[2]);
   mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0x55);
   _mm_store_ss(_mem+9, mem[2]);
}


void iir_mem16_8(const float *x, const float *_den, float *y, int N, int ord, float *_mem)
{
   __m128 den[2], mem[2];

   int i;

   /* Copy denominator and filter memory into xmm registers */
   for (i=0;i<2;i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      den[i] = _mm_loadu_ps(_den+4*i);
   }

   for (i=0;i<N;i++)
   {
      __m128 xx;
      __m128 yy;
      /* Compute next filter result */
      xx = _mm_load_ps1(x+i);
      yy = _mm_add_ss(xx, mem[0]);
      _mm_store_ss(y+i, yy);
      yy = _mm_shuffle_ps(yy, yy, 0);

      /* Update memory */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);

      mem[0] = _mm_sub_ps(mem[0], _mm_mul_ps(yy, den[0]));

      mem[1] = _mm_sub_ss(mem[1], mem[1]);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_sub_ps(mem[1], _mm_mul_ps(yy, den[1]));
   }
   /* Put memory back in its place */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
}

#define OVERRIDE_IIR_MEM16
void iir_mem16(const float *x, const float *_den, float *y, int N, int ord, float *_mem, char *stack)
{
   if(ord==10)
      iir_mem16_10(x, _den, y, N, ord, _mem);
   else if (ord==8)
      iir_mem16_8(x, _den, y, N, ord, _mem);
}
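
/* Scalar sketch of the all-pole recurrence implemented by the two kernels
   above (iir_mem16_scalar is an illustrative name, not part of the Speex
   API): the numerator path is dropped, so only the feedback taps in _den
   update the state. */
static inline void iir_mem16_scalar(const float *x, const float *_den, float *y, int N, int ord, float *_mem)
{
   int i, j;
   for (i=0;i<N;i++)
   {
      float yi = x[i] + _mem[0];
      for (j=0;j<ord-1;j++)
         _mem[j] = _mem[j+1] - yi*_den[j];   /* feedback only */
      _mem[ord-1] = -yi*_den[ord-1];
      y[i] = yi;
   }
}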


void fir_mem16_10(const float *x, const float *_num, float *y, int N, int ord, float *_mem)
{
   __m128 num[3], mem[3];

   int i;

   /* Copy numerator and filter memory into xmm registers */
   for (i=0;i<2;i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      num[i] = _mm_loadu_ps(_num+4*i);
   }
   mem[2] = _mm_setr_ps(_mem[8], _mem[9], 0, 0);
   num[2] = _mm_setr_ps(_num[8], _num[9], 0, 0);

   for (i=0;i<N;i++)
   {
      __m128 xx;
      __m128 yy;
      /* Compute next filter result */
      xx = _mm_load_ps1(x+i);
      yy = _mm_add_ss(xx, mem[0]);
      _mm_store_ss(y+i, yy);
      yy = _mm_shuffle_ps(yy, yy, 0);

      /* Update memory */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);

      mem[0] = _mm_add_ps(mem[0], _mm_mul_ps(xx, num[0]));

      mem[1] = _mm_move_ss(mem[1], mem[2]);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_add_ps(mem[1], _mm_mul_ps(xx, num[1]));

      mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0xfd);

      mem[2] = _mm_add_ps(mem[2], _mm_mul_ps(xx, num[2]));
   }
   /* Put memory back in its place */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
   _mm_store_ss(_mem+8, mem[2]);
   mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0x55);
   _mm_store_ss(_mem+9, mem[2]);
}

void fir_mem16_8(const float *x, const float *_num, float *y, int N, int ord, float *_mem)
{
   __m128 num[2], mem[2];

   int i;

   /* Copy numerator and filter memory into xmm registers */
   for (i=0;i<2;i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      num[i] = _mm_loadu_ps(_num+4*i);
   }

   for (i=0;i<N;i++)
   {
      __m128 xx;
      __m128 yy;
      /* Compute next filter result */
      xx = _mm_load_ps1(x+i);
      yy = _mm_add_ss(xx, mem[0]);
      _mm_store_ss(y+i, yy);
      yy = _mm_shuffle_ps(yy, yy, 0);

      /* Update memory */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);

      mem[0] = _mm_add_ps(mem[0], _mm_mul_ps(xx, num[0]));

      mem[1] = _mm_sub_ss(mem[1], mem[1]);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_add_ps(mem[1], _mm_mul_ps(xx, num[1]));
   }
   /* Put memory back in its place */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
}

#define OVERRIDE_FIR_MEM16
void fir_mem16(const float *x, const float *_num, float *y, int N, int ord, float *_mem, char *stack)
{
   if(ord==10)
      fir_mem16_10(x, _num, y, N, ord, _mem);
   else if (ord==8)
      fir_mem16_8(x, _num, y, N, ord, _mem);
}
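
/* Scalar sketch of the all-zero recurrence implemented by the two kernels
   above (fir_mem16_scalar is an illustrative name, not part of the Speex
   API): the feedback path is dropped, so only the feed-forward taps in _num
   update the state. */
static inline void fir_mem16_scalar(const float *x, const float *_num, float *y, int N, int ord, float *_mem)
{
   int i, j;
   for (i=0;i<N;i++)
   {
      float yi = x[i] + _mem[0];
      for (j=0;j<ord-1;j++)
         _mem[j] = _mem[j+1] + x[i]*_num[j];   /* feed-forward only */
      _mem[ord-1] = x[i]*_num[ord-1];
      y[i] = yi;
   }
}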