CMSIS DSP library

Dependents:   performance_timer Surfboard_ gps2rtty Capstone ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_fir_fast_q15.c Source File

arm_fir_fast_q15.c

00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        19. March 2015
00005 * $Revision:    V.1.4.5
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_fir_fast_q15.c    
00009 *    
00010 * Description:  Q15 Fast FIR filter processing function.    
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE. 
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**    
00044  * @ingroup groupFilters    
00045  */
00046 
00047 /**    
00048  * @addtogroup FIR    
00049  * @{    
00050  */
00051 
00052 /**    
00053  * @param[in] *S points to an instance of the Q15 FIR filter structure.    
00054  * @param[in] *pSrc points to the block of input data.    
00055  * @param[out] *pDst points to the block of output data.    
00056  * @param[in] blockSize number of samples to process per call.    
00057  * @return none.    
00058  *    
00059  * <b>Scaling and Overflow Behavior:</b>    
00060  * \par    
00061  * This fast version uses a 32-bit accumulator with 2.30 format.    
00062  * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.    
00063  * Thus, if the accumulator result overflows it wraps around and distorts the result.    
00064  * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.    
00065  * The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result.    
00066  *    
00067  * \par    
00068  * Refer to the function <code>arm_fir_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.  Both the slow and the fast versions use the same instance structure.    
00069  * Use the function <code>arm_fir_init_q15()</code> to initialize the filter structure.    
00070  */
00071 
00072 void arm_fir_fast_q15(
00073   const arm_fir_instance_q15 * S,
00074   q15_t * pSrc,
00075   q15_t * pDst,
00076   uint32_t blockSize)
00077 {
00078   q15_t *pState = S->pState;                     /* State pointer */
00079   q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
00080   q15_t *pStateCurnt;                            /* Points to the current sample of the state */
00081   q31_t acc0, acc1, acc2, acc3;                  /* Accumulators */
00082   q15_t *pb;                                     /* Temporary pointer for coefficient buffer */
00083   q15_t *px;                                     /* Temporary q31 pointer for SIMD state buffer accesses */
00084   q31_t x0, x1, x2, c0;                          /* Temporary variables to hold SIMD state and coefficient values */
00085   uint32_t numTaps = S->numTaps;                 /* Number of taps in the filter */
00086   uint32_t tapCnt, blkCnt;                       /* Loop counters */
00087 
00088 
00089   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
00090   /* pStateCurnt points to the location where the new input data should be written */
00091   pStateCurnt = &(S->pState[(numTaps - 1u)]);
00092 
00093   /* Apply loop unrolling and compute 4 output values simultaneously.      
00094    * The variables acc0 ... acc3 hold output values that are being computed:      
00095    *      
00096    *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]      
00097    *    acc1 =  b[numTaps-1] * x[n-numTaps] +   b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]      
00098    *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] +   b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]      
00099    *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]      
00100    */
00101 
00102   blkCnt = blockSize >> 2;
00103 
00104   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.      
00105    ** a second loop below computes the remaining 1 to 3 samples. */
00106   while(blkCnt > 0u)
00107   {
00108     /* Copy four new input samples into the state buffer.      
00109      ** Use 32-bit SIMD to move the 16-bit data.  Only requires two copies. */
00110     *pStateCurnt++ = *pSrc++;
00111     *pStateCurnt++ = *pSrc++;
00112     *pStateCurnt++ = *pSrc++;
00113     *pStateCurnt++ = *pSrc++;
00114 
00115 
00116     /* Set all accumulators to zero */
00117     acc0 = 0;
00118     acc1 = 0;
00119     acc2 = 0;
00120     acc3 = 0;
00121 
00122     /* Typecast q15_t pointer to q31_t pointer for state reading in q31_t */
00123     px = pState;
00124 
00125     /* Typecast q15_t pointer to q31_t pointer for coefficient reading in q31_t */
00126     pb = pCoeffs;
00127 
00128     /* Read the first two samples from the state buffer:  x[n-N], x[n-N-1] */
00129     x0 = *__SIMD32(px)++;
00130 
00131     /* Read the third and forth samples from the state buffer: x[n-N-2], x[n-N-3] */
00132     x2 = *__SIMD32(px)++;
00133 
00134     /* Loop over the number of taps.  Unroll by a factor of 4.      
00135      ** Repeat until we've computed numTaps-(numTaps%4) coefficients. */
00136     tapCnt = numTaps >> 2;
00137 
00138     while(tapCnt > 0)
00139     {
00140       /* Read the first two coefficients using SIMD:  b[N] and b[N-1] coefficients */
00141       c0 = *__SIMD32(pb)++;
00142 
00143       /* acc0 +=  b[N] * x[n-N] + b[N-1] * x[n-N-1] */
00144       acc0 = __SMLAD(x0, c0, acc0);
00145 
00146       /* acc2 +=  b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
00147       acc2 = __SMLAD(x2, c0, acc2);
00148 
00149       /* pack  x[n-N-1] and x[n-N-2] */
00150 #ifndef ARM_MATH_BIG_ENDIAN
00151       x1 = __PKHBT(x2, x0, 0);
00152 #else
00153       x1 = __PKHBT(x0, x2, 0);
00154 #endif
00155 
00156       /* Read state x[n-N-4], x[n-N-5] */
00157       x0 = _SIMD32_OFFSET(px);
00158 
00159       /* acc1 +=  b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
00160       acc1 = __SMLADX(x1, c0, acc1);
00161 
00162       /* pack  x[n-N-3] and x[n-N-4] */
00163 #ifndef ARM_MATH_BIG_ENDIAN
00164       x1 = __PKHBT(x0, x2, 0);
00165 #else
00166       x1 = __PKHBT(x2, x0, 0);
00167 #endif
00168 
00169       /* acc3 +=  b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
00170       acc3 = __SMLADX(x1, c0, acc3);
00171 
00172       /* Read coefficients b[N-2], b[N-3] */
00173       c0 = *__SIMD32(pb)++;
00174 
00175       /* acc0 +=  b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
00176       acc0 = __SMLAD(x2, c0, acc0);
00177 
00178       /* Read state x[n-N-6], x[n-N-7] with offset */
00179       x2 = _SIMD32_OFFSET(px + 2u);
00180 
00181       /* acc2 +=  b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
00182       acc2 = __SMLAD(x0, c0, acc2);
00183 
00184       /* acc1 +=  b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */
00185       acc1 = __SMLADX(x1, c0, acc1);
00186 
00187       /* pack  x[n-N-5] and x[n-N-6] */
00188 #ifndef ARM_MATH_BIG_ENDIAN
00189       x1 = __PKHBT(x2, x0, 0);
00190 #else
00191       x1 = __PKHBT(x0, x2, 0);
00192 #endif
00193 
00194       /* acc3 +=  b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
00195       acc3 = __SMLADX(x1, c0, acc3);
00196 
00197       /* Update state pointer for next state reading */
00198       px += 4u;
00199 
00200       /* Decrement tap count */
00201       tapCnt--;
00202 
00203     }
00204 
00205     /* If the filter length is not a multiple of 4, compute the remaining filter taps.       
00206      ** This is always be 2 taps since the filter length is even. */
00207     if((numTaps & 0x3u) != 0u)
00208     {
00209 
00210       /* Read last two coefficients */
00211       c0 = *__SIMD32(pb)++;
00212 
00213       /* Perform the multiply-accumulates */
00214       acc0 = __SMLAD(x0, c0, acc0);
00215       acc2 = __SMLAD(x2, c0, acc2);
00216 
00217       /* pack state variables */
00218 #ifndef ARM_MATH_BIG_ENDIAN
00219       x1 = __PKHBT(x2, x0, 0);
00220 #else
00221       x1 = __PKHBT(x0, x2, 0);
00222 #endif
00223 
00224       /* Read last state variables */
00225       x0 = *__SIMD32(px);
00226 
00227       /* Perform the multiply-accumulates */
00228       acc1 = __SMLADX(x1, c0, acc1);
00229 
00230       /* pack state variables */
00231 #ifndef ARM_MATH_BIG_ENDIAN
00232       x1 = __PKHBT(x0, x2, 0);
00233 #else
00234       x1 = __PKHBT(x2, x0, 0);
00235 #endif
00236 
00237       /* Perform the multiply-accumulates */
00238       acc3 = __SMLADX(x1, c0, acc3);
00239     }
00240 
00241     /* The results in the 4 accumulators are in 2.30 format.  Convert to 1.15 with saturation.       
00242      ** Then store the 4 outputs in the destination buffer. */
00243 
00244 #ifndef ARM_MATH_BIG_ENDIAN
00245 
00246     *__SIMD32(pDst)++ =
00247       __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
00248 
00249     *__SIMD32(pDst)++ =
00250       __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
00251 
00252 #else
00253 
00254     *__SIMD32(pDst)++ =
00255       __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
00256 
00257     *__SIMD32(pDst)++ =
00258       __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
00259 
00260 
00261 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN       */
00262 
00263     /* Advance the state pointer by 4 to process the next group of 4 samples */
00264     pState = pState + 4u;
00265 
00266     /* Decrement the loop counter */
00267     blkCnt--;
00268   }
00269 
00270   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.      
00271    ** No loop unrolling is used. */
00272   blkCnt = blockSize % 0x4u;
00273   while(blkCnt > 0u)
00274   {
00275     /* Copy two samples into state buffer */
00276     *pStateCurnt++ = *pSrc++;
00277 
00278     /* Set the accumulator to zero */
00279     acc0 = 0;
00280 
00281     /* Use SIMD to hold states and coefficients */
00282     px = pState;
00283     pb = pCoeffs;
00284 
00285     tapCnt = numTaps >> 1u;
00286 
00287     do
00288     {
00289 
00290       acc0 += (q31_t) * px++ * *pb++;
00291       acc0 += (q31_t) * px++ * *pb++;
00292 
00293       tapCnt--;
00294     }
00295     while(tapCnt > 0u);
00296 
00297     /* The result is in 2.30 format.  Convert to 1.15 with saturation.      
00298      ** Then store the output in the destination buffer. */
00299     *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
00300 
00301     /* Advance state pointer by 1 for the next sample */
00302     pState = pState + 1u;
00303 
00304     /* Decrement the loop counter */
00305     blkCnt--;
00306   }
00307 
00308   /* Processing is complete.      
00309    ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.      
00310    ** This prepares the state buffer for the next function call. */
00311 
00312   /* Points to the start of the state buffer */
00313   pStateCurnt = S->pState;
00314 
00315   /* Calculation of count for copying integer writes */
00316   tapCnt = (numTaps - 1u) >> 2;
00317 
00318   while(tapCnt > 0u)
00319   {
00320     *pStateCurnt++ = *pState++;
00321     *pStateCurnt++ = *pState++;
00322     *pStateCurnt++ = *pState++;
00323     *pStateCurnt++ = *pState++;
00324 
00325     tapCnt--;
00326 
00327   }
00328 
00329   /* Calculation of count for remaining q15_t data */
00330   tapCnt = (numTaps - 1u) % 0x4u;
00331 
00332   /* copy remaining data */
00333   while(tapCnt > 0u)
00334   {
00335     *pStateCurnt++ = *pState++;
00336 
00337     /* Decrement the loop counter */
00338     tapCnt--;
00339   }
00340 
00341 }
00342 
00343 /**    
00344  * @} end of FIR group    
00345  */