CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_fir_q15.c Source File

arm_fir_q15.c

00001 /* ----------------------------------------------------------------------  
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        29. November 2010  
00005 * $Revision:    V1.0.3  
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_fir_q15.c  
00009 *  
00010 * Description:  Q15 FIR filter processing function.  
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.3 2010/11/29 
00015 *    Re-organized the CMSIS folders and updated documentation.  
00016 *   
00017 * Version 1.0.2 2010/11/11  
00018 *    Documentation updated.   
00019 *  
00020 * Version 1.0.1 2010/10/05   
00021 *    Production release and review comments incorporated.  
00022 *  
00023 * Version 1.0.0 2010/09/20   
00024 *    Production release and review comments incorporated.  
00025 *  
00026 * Version 0.0.5  2010/04/26   
00027 *    incorporated review comments and updated with latest CMSIS layer  
00028 *  
00029 * Version 0.0.3  2010/03/10   
00030 *    Initial version  
00031 * -------------------------------------------------------------------- */ 
00032  
00033 #include "arm_math.h" 
00034  
00035 /**  
00036  * @ingroup groupFilters  
00037  */ 
00038  
00039 /**  
00040  * @addtogroup FIR  
00041  * @{  
00042  */ 
00043  
00044 /**
00045  * @brief Processing function for the Q15 FIR filter.
00046  * @param[in] *S points to an instance of the Q15 FIR structure.
00047  * @param[in] *pSrc points to the block of input data.
00048  * @param[out] *pDst points to the block of output data.
00049  * @param[in]  blockSize number of samples to process per call.
00050  * @return none.
00051  *
00052  * <b>Scaling and Overflow Behavior:</b>
00053  * \par
00054  * The function is implemented using a 64-bit internal accumulator.
00055  * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
00056  * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
00057  * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
00058  * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
00059  * Lastly, the accumulator is saturated to yield a result in 1.15 format.
00060  *
00061  * \par
00062  * numTaps is expected to be even: taps are consumed two at a time through 32-bit SIMD reads. A 2-tap filter is handled by the 2-tap tail alone.
00063  *
00064  * \par
00065  * Refer to the function <code>arm_fir_fast_q15()</code> for a faster but less precise implementation of this function.
00066  */
00067
00068 void arm_fir_q15(
00069   const arm_fir_instance_q15 * S,
00070   q15_t * pSrc,
00071   q15_t * pDst,
00072   uint32_t blockSize)
00073 {
00074   q15_t *pState = S->pState;                     /* State pointer */
00075   q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
00076   q15_t *pStateCurnt;                            /* Points to the current sample of the state */
00077   q15_t *px1;                                    /* Temporary q15 pointer for state buffer */
00078   q31_t *pb;                                     /* Temporary pointer for coefficient buffer */
00079   q31_t *px2;                                    /* Temporary q31 pointer for SIMD state buffer accesses */
00080   q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold SIMD state and coefficient values */
00081   q63_t acc0, acc1, acc2, acc3;                  /* Accumulators */
00082   uint32_t numTaps = S->numTaps;                 /* Number of taps in the filter */
00083   uint32_t tapCnt, blkCnt;                       /* Loop counters */
00084
00085   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
00086   /* pStateCurnt points to the location where the new input data should be written */
00087   pStateCurnt = &(S->pState[(numTaps - 1u)]);
00088
00089   /* Apply loop unrolling and compute 4 output values simultaneously.
00090    * The variables acc0 ... acc3 hold output values that are being computed:
00091    *
00092    *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
00093    *    acc1 =  b[numTaps-1] * x[n-numTaps] +   b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
00094    *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] +   b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
00095    *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
00096    */
00097   blkCnt = blockSize >> 2;
00098
00099   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
00100    ** a second loop below computes the remaining 1 to 3 samples. */
00101   while(blkCnt > 0u)
00102   {
00103     /* Copy four new input samples into the state buffer.
00104      ** Use 32-bit SIMD to move the 16-bit data.  Only requires two copies. */
00105     *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++;
00106     *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++;
00107
00108     /* Set all accumulators to zero */
00109     acc0 = 0;
00110     acc1 = 0;
00111     acc2 = 0;
00112     acc3 = 0;
00113
00114     /* Initialize state pointer of type q15 */
00115     px1 = pState;
00116
00117     /* Initialize coeff pointer of type q31 */
00118     pb = (q31_t *) (pCoeffs);
00119
00120     /* Read state samples 0 and 1 as one 32-bit word; px1 is a q15_t pointer, so it advances by ONE sample */
00121     x0 = *(q31_t *) (px1++);
00122
00123     /* Read state samples 1 and 2 as one 32-bit word (overlaps x0 by one sample) */
00124     x1 = *(q31_t *) (px1++);
00125
00126     /* Loop over the taps, four per iteration.  The loop is pre-tested so that a 2-tap filter
00127      ** (numTaps >> 2 == 0) skips it and is handled entirely by the 2-tap tail below. */
00128     tapCnt = numTaps >> 2;
00129     while(tapCnt > 0u)
00130     {
00131       /* Read the first two coefficients using SIMD:  b[N] and b[N-1] coefficients */
00132       c0 = *(pb++);
00133
00134       /* acc0 +=  b[N] * x[n-N] + b[N-1] * x[n-N-1] */
00135       acc0 = __SMLALD(x0, c0, acc0);
00136
00137       /* acc1 +=  b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
00138       acc1 = __SMLALD(x1, c0, acc1);
00139
00140       /* Read state x[n-N-2], x[n-N-3] */
00141       x2 = *(q31_t *) (px1++);
00142
00143       /* Read state x[n-N-3], x[n-N-4] */
00144       x3 = *(q31_t *) (px1++);
00145
00146       /* acc2 +=  b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
00147       acc2 = __SMLALD(x2, c0, acc2);
00148
00149       /* acc3 +=  b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
00150       acc3 = __SMLALD(x3, c0, acc3);
00151
00152       /* Read coefficients b[N-2], b[N-3] */
00153       c0 = *(pb++);
00154
00155       /* acc0 +=  b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
00156       acc0 = __SMLALD(x2, c0, acc0);
00157
00158       /* acc1 +=  b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */
00159       acc1 = __SMLALD(x3, c0, acc1);
00160
00161       /* Read state x[n-N-4], x[n-N-5] */
00162       x0 = *(q31_t *) (px1++);
00163
00164       /* Read state x[n-N-5], x[n-N-6] */
00165       x1 = *(q31_t *) (px1++);
00166
00167       /* acc2 +=  b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
00168       acc2 = __SMLALD(x0, c0, acc2);
00169
00170       /* acc3 +=  b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
00171       acc3 = __SMLALD(x1, c0, acc3);
00172       tapCnt--;
00173
00174     }
00175
00176     /* If the filter length is not a multiple of 4, compute the remaining filter taps.
00177      ** This is always 2 taps since the filter length is even. */
00178     if((numTaps & 0x3u) != 0u)
00179     {
00180       /* Read 2 coefficients */
00181       c0 = *(pb++);
00182       /* Fetch the two further state words; x0 and x1 were already loaded above */
00183       x2 = *(q31_t *) (px1++);
00184       x3 = *(q31_t *) (px1++);
00185
00186       /* Perform the multiply-accumulates */
00187       acc0 = __SMLALD(x0, c0, acc0);
00188       acc1 = __SMLALD(x1, c0, acc1);
00189       acc2 = __SMLALD(x2, c0, acc2);
00190       acc3 = __SMLALD(x3, c0, acc3);
00191     }
00192
00193     /* The results in the 4 accumulators are in 2.30 format.  Convert to 1.15 with saturation.
00194      ** Then store the 4 outputs in the destination buffer. */
00195     *__SIMD32(pDst)++ =
00196       __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
00197     *__SIMD32(pDst)++ =
00198       __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
00199
00200     /* Advance the state pointer by 4 to process the next group of 4 samples */
00201     pState = pState + 4;
00202
00203     /* Decrement the loop counter */
00204     blkCnt--;
00205   }
00206
00207   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
00208    ** No loop unrolling is used. */
00209   blkCnt = blockSize % 0x4u;
00210   while(blkCnt > 0u)
00211   {
00212     /* Copy one new input sample into the state buffer */
00213     *pStateCurnt++ = *pSrc++;
00214
00215     /* Set the accumulator to zero */
00216     acc0 = 0;
00217
00218     /* Walk state and coefficients as 32-bit words: two taps per __SMLALD */
00219     px2 = (q31_t *) pState;
00220     pb = (q31_t *) (pCoeffs);
00221     tapCnt = numTaps >> 1;
00222
00223     while(tapCnt > 0u)
00224     {
00225       acc0 = __SMLALD(*px2++, *(pb++), acc0);
00226       tapCnt--;
00227     }
00228
00229     /* The result is in 2.30 format.  Convert to 1.15 with saturation.
00230      ** Then store the output in the destination buffer. */
00231     *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
00232
00233     /* Advance state pointer by 1 for the next sample */
00234     pState = pState + 1;
00235
00236     /* Decrement the loop counter */
00237     blkCnt--;
00238   }
00239
00240   /* Processing is complete.
00241    ** Now copy the last numTaps - 1 samples to the start of the state buffer.
00242    ** This prepares the state buffer for the next function call. */
00243
00244   /* Points to the start of the state buffer */
00245   pStateCurnt = S->pState;
00246
00247   /* Number of 4-sample groups to copy (two 32-bit moves per group) */
00248   tapCnt = (numTaps - 1u) >> 2;
00249
00250   while(tapCnt > 0u)
00251   {
00252     *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
00253     *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
00254
00255     tapCnt--;
00256
00257   }
00258
00259   /* Calculation of count for remaining q15_t data */
00260   tapCnt = (numTaps - 1u) % 0x4u;
00261
00262   /* copy remaining data */
00263   while(tapCnt > 0u)
00264   {
00265     *pStateCurnt++ = *pState++;
00266
00267     /* Decrement the loop counter */
00268     tapCnt--;
00269   }
00270 }
00271  
00272 /**  
00273  * @} end of FIR group  
00274  */