CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_fir_fast_q15.c Source File

arm_fir_fast_q15.c

00001 /* ----------------------------------------------------------------------  
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        29. November 2010  
00005 * $Revision:    V1.0.3  
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_fir_fast_q15.c  
00009 *  
00010 * Description:  Q15 Fast FIR filter processing function.  
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.3 2010/11/29 
00015 *    Re-organized the CMSIS folders and updated documentation.  
00016 *   
00017 * Version 1.0.2 2010/11/11  
00018 *    Documentation updated.   
00019 *  
00020 * Version 1.0.1 2010/10/05   
00021 *    Production release and review comments incorporated.  
00022 *  
00023 * Version 1.0.0 2010/09/20   
00024 *    Production release and review comments incorporated.  
00025 *  
00026 * Version 0.0.9  2010/08/16   
00027 *    Initial version  
00028 *  
00029 * -------------------------------------------------------------------- */ 
00030  
00031 #include "arm_math.h" 
00032  
00033 /**  
00034  * @ingroup groupFilters  
00035  */ 
00036  
00037 /**  
00038  * @addtogroup FIR  
00039  * @{  
00040  */ 
00041  
00042 /**  
00043  * @param[in] *S points to an instance of the Q15 FIR filter structure.  
00044  * @param[in] *pSrc points to the block of input data.  
00045  * @param[out] *pDst points to the block of output data.  
00046  * @param[in] blockSize number of samples to process per call.  
00047  * @return none.  
00048  *  
00049  * <b>Scaling and Overflow Behavior:</b>  
00050  * \par  
00051  * This fast version uses a 32-bit accumulator with 2.30 format.  
00052  * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.  
00053  * Thus, if the accumulator result overflows it wraps around and distorts the result.  
00054  * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.  
00055  * The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result.  
00056  *  
00057  * \par  
00058  * Refer to the function <code>arm_fir_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.  Both the slow and the fast versions use the same instance structure.  
00059  * Use the function <code>arm_fir_init_q15()</code> to initialize the filter structure.  
00060  */ 
00061  
00062 void arm_fir_fast_q15( 
00063   const arm_fir_instance_q15 * S, 
00064   q15_t * pSrc, 
00065   q15_t * pDst, 
00066   uint32_t blockSize) 
00067 { 
00068   q15_t *pState = S->pState;                     /* State pointer */ 
00069   q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */ 
00070   q15_t *pStateCurnt;                            /* Points to the current sample of the state */ 
00071   q15_t *px1;                                    /* Temporary q15 pointer for state buffer */ 
00072   q31_t *pb;                                     /* Temporary pointer for coefficient buffer */ 
00073   q31_t *px2;                                    /* Temporary q31 pointer for SIMD state buffer accesses */ 
00074   q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold SIMD state and coefficient values */ 
00075   q31_t acc0, acc1, acc2, acc3;                  /* Accumulators */ 
00076   uint32_t numTaps = S->numTaps;                 /* Number of taps in the filter */ 
00077   uint32_t tapCnt, blkCnt;                       /* Loop counters */ 
00078  
00079   /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */ 
00080   /* pStateCurnt points to the location where the new input data should be written */ 
00081   pStateCurnt = &(S->pState[(numTaps - 1u)]); 
00082  
00083   /* Apply loop unrolling and compute 4 output values simultaneously.  
00084    * The variables acc0 ... acc3 hold output values that are being computed:  
00085    *  
00086    *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]  
00087    *    acc1 =  b[numTaps-1] * x[n-numTaps] +   b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]  
00088    *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] +   b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]  
00089    *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]  
00090    */ 
00091   blkCnt = blockSize >> 2; 
00092  
00093   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.  
00094    ** a second loop below computes the remaining 1 to 3 samples. */ 
00095   while(blkCnt > 0u) 
00096   { 
00097     /* Copy four new input samples into the state buffer.  
00098      ** Use 32-bit SIMD to move the 16-bit data.  Only requires two copies. */ 
00099     *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++; 
00100     *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++; 
00101  
00102     /* Set all accumulators to zero */ 
00103     acc0 = 0; 
00104     acc1 = 0; 
00105     acc2 = 0; 
00106     acc3 = 0; 
00107  
00108     /* Initialize state pointer of type q15 */ 
00109     px1 = pState; 
00110  
00111     /* Initialize coeff pointer of type q31 */ 
00112     pb = (q31_t *) (pCoeffs); 
00113  
00114     /* Read the first two samples from the state buffer:  x[n-N], x[n-N-1] */ 
00115     x0 = *(q31_t *) (px1++); 
00116  
00117     /* Read the third and forth samples from the state buffer: x[n-N-1], x[n-N-2] */ 
00118     x1 = *(q31_t *) (px1++); 
00119  
00120     /* Loop over the number of taps.  Unroll by a factor of 4.  
00121      ** Repeat until we've computed numTaps-4 coefficients. */ 
00122     tapCnt = numTaps >> 2; 
00123     do 
00124     { 
00125       /* Read the first two coefficients using SIMD:  b[N] and b[N-1] coefficients */ 
00126       c0 = *(pb++); 
00127  
00128       /* acc0 +=  b[N] * x[n-N] + b[N-1] * x[n-N-1] */ 
00129       acc0 = __SMLAD(x0, c0, acc0); 
00130  
00131       /* acc1 +=  b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */ 
00132       acc1 = __SMLAD(x1, c0, acc1); 
00133  
00134       /* Read state x[n-N-2], x[n-N-3] */ 
00135       x2 = *(q31_t *) (px1++); 
00136  
00137       /* Read state x[n-N-3], x[n-N-4] */ 
00138       x3 = *(q31_t *) (px1++); 
00139  
00140       /* acc2 +=  b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */ 
00141       acc2 = __SMLAD(x2, c0, acc2); 
00142  
00143       /* acc3 +=  b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */ 
00144       acc3 = __SMLAD(x3, c0, acc3); 
00145  
00146       /* Read coefficients b[N-2], b[N-3] */ 
00147       c0 = *(pb++); 
00148  
00149       /* acc0 +=  b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */ 
00150       acc0 = __SMLAD(x2, c0, acc0); 
00151  
00152       /* acc1 +=  b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */ 
00153       acc1 = __SMLAD(x3, c0, acc1); 
00154  
00155       /* Read state x[n-N-4], x[n-N-5] */ 
00156       x0 = *(q31_t *) (px1++); 
00157  
00158       /* Read state x[n-N-5], x[n-N-6] */ 
00159       x1 = *(q31_t *) (px1++); 
00160  
00161       /* acc2 +=  b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */ 
00162       acc2 = __SMLAD(x0, c0, acc2); 
00163  
00164       /* acc3 +=  b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */ 
00165       acc3 = __SMLAD(x1, c0, acc3); 
00166       tapCnt--; 
00167  
00168     } 
00169     while(tapCnt > 0u); 
00170  
00171     /* If the filter length is not a multiple of 4, compute the remaining filter taps.  
00172      ** This is always 2 taps since the filter length is always even. */ 
00173     if((numTaps & 0x3u) != 0u) 
00174     { 
00175       /* Read 2 coefficients */ 
00176       c0 = *(pb++); 
00177       /* Fetch 4 state variables */ 
00178       x2 = *(q31_t *) (px1++); 
00179       x3 = *(q31_t *) (px1++); 
00180  
00181       /* Perform the multiply-accumulates */ 
00182       acc0 = __SMLAD(x0, c0, acc0); 
00183       acc1 = __SMLAD(x1, c0, acc1); 
00184       acc2 = __SMLAD(x2, c0, acc2); 
00185       acc3 = __SMLAD(x3, c0, acc3); 
00186     } 
00187  
00188     /* The results in the 4 accumulators are in 2.30 format.  Convert to 1.15 with saturation.  
00189      ** Then store the 4 outputs in the destination buffer. */ 
00190     *__SIMD32(pDst)++ = __PKHBT((acc0 >> 15), (acc1 >> 15), 16u); 
00191     *__SIMD32(pDst)++ = __PKHBT((acc2 >> 15), (acc3 >> 15), 16u); 
00192  
00193  
00194     /* Advance the state pointer by 4 to process the next group of 4 samples */ 
00195     pState = pState + 4; 
00196  
00197     /* Decrement the loop counter */ 
00198     blkCnt--; 
00199   } 
00200  
00201   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.  
00202    ** No loop unrolling is used. */ 
00203   blkCnt = blockSize % 0x4u; 
00204   while(blkCnt > 0u) 
00205   { 
00206     /* Copy two samples into state buffer */ 
00207     *pStateCurnt++ = *pSrc++; 
00208  
00209     /* Set the accumulator to zero */ 
00210     acc0 = 0; 
00211  
00212     /* Use SIMD to hold states and coefficients */ 
00213     px2 = (q31_t *) pState; 
00214     pb = (q31_t *) (pCoeffs); 
00215     tapCnt = numTaps >> 1; 
00216  
00217     do 
00218     { 
00219       acc0 = __SMLAD(*px2++, *(pb++), acc0); 
00220       tapCnt--; 
00221     } 
00222     while(tapCnt > 0u); 
00223  
00224     /* The result is in 2.30 format.  Convert to 1.15 with saturation.  
00225      ** Then store the output in the destination buffer. */ 
00226     *pDst++ = (q15_t) ((acc0 >> 15)); 
00227  
00228     /* Advance state pointer by 1 for the next sample */ 
00229     pState = pState + 1; 
00230  
00231     /* Decrement the loop counter */ 
00232     blkCnt--; 
00233   } 
00234  
00235   /* Processing is complete.  
00236    ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.  
00237    ** This prepares the state buffer for the next function call. */ 
00238  
00239   /* Points to the start of the state buffer */ 
00240   pStateCurnt = S->pState; 
00241   /* Calculation of count for copying integer writes */ 
00242   tapCnt = (numTaps - 1u) >> 2; 
00243  
00244   while(tapCnt > 0u) 
00245   { 
00246     *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; 
00247     *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; 
00248  
00249     tapCnt--; 
00250   } 
00251  
00252   /* Calculation of count for remaining q15_t data */ 
00253   tapCnt = (numTaps - 1u) % 0x4u; 
00254  
00255   /* copy remaining data */ 
00256   while(tapCnt > 0u) 
00257   { 
00258     *pStateCurnt++ = *pState++; 
00259  
00260     /* Decrement the loop counter */ 
00261     tapCnt--; 
00262   } 
00263 } 
00264  
00265 /**  
00266  * @} end of FIR group  
00267  */