CMSIS DSP library

Dependents:   KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more

Fork of mbed-dsp by mbed official

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_fir_fast_q31.c Source File

arm_fir_fast_q31.c

00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        17. January 2013  
00005 * $Revision:    V1.4.1  
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_fir_fast_q31.c    
00009 *    
00010 * Description:  Processing function for the Q31 Fast FIR filter.    
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.    
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**    
00044  * @ingroup groupFilters    
00045  */
00046 
00047 /**    
00048  * @addtogroup FIR    
00049  * @{    
00050  */
00051 
00052 /**    
00053  * @param[in] *S points to an instance of the Q31 structure.    
00054  * @param[in] *pSrc points to the block of input data.    
00055  * @param[out] *pDst points to the block output data.    
00056  * @param[in] blockSize number of samples to process per call.    
00057  * @return none.    
00058  *    
00059  * <b>Scaling and Overflow Behavior:</b>    
00060  *    
00061  * \par    
00062  * This function is optimized for speed at the expense of fixed-point precision and overflow protection.    
00063  * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.    
00064  * These intermediate results are added to a 2.30 accumulator.    
00065  * Finally, the accumulator is saturated and converted to a 1.31 result.    
00066  * The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.    
00067  * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.    
00068  *    
00069  * \par    
00070  * Refer to the function <code>arm_fir_q31()</code> for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.  Both the slow and the fast versions use the same instance structure.    
00071  * Use the function <code>arm_fir_init_q31()</code> to initialize the filter structure.    
00072  */
00073 
00074 IAR_ONLY_LOW_OPTIMIZATION_ENTER
00075 void arm_fir_fast_q31(
00076   const arm_fir_instance_q31 * S,
00077   q31_t * pSrc,
00078   q31_t * pDst,
00079   uint32_t blockSize)
00080 {
00081   q31_t *pState = S->pState;                     /* State pointer */
00082   q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
00083   q31_t *pStateCurnt;                            /* Points to the current sample of the state */
00084   q31_t x0, x1, x2, x3;                          /* Temporary variables to hold state */
00085   q31_t c0;                                      /* Temporary variable to hold coefficient value */
00086   q31_t *px;                                     /* Temporary pointer for state */
00087   q31_t *pb;                                     /* Temporary pointer for coefficient buffer */
00088   q31_t acc0, acc1, acc2, acc3;                  /* Accumulators */
00089   uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
00090   uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
00091 
00092   /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
00093   /* pStateCurnt points to the location where the new input data should be written */
00094   pStateCurnt = &(S->pState[(numTaps - 1u)]);
00095 
00096   /* Apply loop unrolling and compute 4 output values simultaneously.    
00097    * The variables acc0 ... acc3 hold output values that are being computed:    
00098    *    
00099    *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]    
00100    *    acc1 =  b[numTaps-1] * x[n-numTaps] +   b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]    
00101    *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] +   b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]    
00102    *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]    
00103    */
00104   blkCnt = blockSize >> 2;
00105 
00106   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.    
00107    ** a second loop below computes the remaining 1 to 3 samples. */
00108   while(blkCnt > 0u)
00109   {
00110     /* Copy four new input samples into the state buffer */
00111     *pStateCurnt++ = *pSrc++;
00112     *pStateCurnt++ = *pSrc++;
00113     *pStateCurnt++ = *pSrc++;
00114     *pStateCurnt++ = *pSrc++;
00115 
00116     /* Set all accumulators to zero */
00117     acc0 = 0;
00118     acc1 = 0;
00119     acc2 = 0;
00120     acc3 = 0;
00121 
00122     /* Initialize state pointer */
00123     px = pState;
00124 
00125     /* Initialize coefficient pointer */
00126     pb = pCoeffs;
00127 
00128     /* Read the first three samples from the state buffer:    
00129      *  x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
00130     x0 = *(px++);
00131     x1 = *(px++);
00132     x2 = *(px++);
00133 
00134     /* Loop unrolling.  Process 4 taps at a time. */
00135     tapCnt = numTaps >> 2;
00136     i = tapCnt;
00137 
00138     while(i > 0u)
00139     {
00140       /* Read the b[numTaps] coefficient */
00141       c0 = *(pb++);
00142 
00143       /* Read x[n-numTaps-3] sample */
00144       x3 = *(px++);
00145 
00146       /* acc0 +=  b[numTaps] * x[n-numTaps] */
00147       multAcc_32x32_keep32_R(acc0, x0, c0);
00148 
00149       /* acc1 +=  b[numTaps] * x[n-numTaps-1] */
00150       multAcc_32x32_keep32_R(acc1, x1, c0);
00151 
00152       /* acc2 +=  b[numTaps] * x[n-numTaps-2] */
00153       multAcc_32x32_keep32_R(acc2, x2, c0);
00154 
00155       /* acc3 +=  b[numTaps] * x[n-numTaps-3] */
00156       multAcc_32x32_keep32_R(acc3, x3, c0);
00157 
00158       /* Read the b[numTaps-1] coefficient */
00159       c0 = *(pb++);
00160 
00161       /* Read x[n-numTaps-4] sample */
00162       x0 = *(px++);
00163 
00164       /* Perform the multiply-accumulates */      
00165       multAcc_32x32_keep32_R(acc0, x1, c0);
00166       multAcc_32x32_keep32_R(acc1, x2, c0);
00167       multAcc_32x32_keep32_R(acc2, x3, c0);
00168       multAcc_32x32_keep32_R(acc3, x0, c0);
00169 
00170       /* Read the b[numTaps-2] coefficient */
00171       c0 = *(pb++);
00172 
00173       /* Read x[n-numTaps-5] sample */
00174       x1 = *(px++);
00175 
00176       /* Perform the multiply-accumulates */      
00177       multAcc_32x32_keep32_R(acc0, x2, c0);
00178       multAcc_32x32_keep32_R(acc1, x3, c0);
00179       multAcc_32x32_keep32_R(acc2, x0, c0);
00180       multAcc_32x32_keep32_R(acc3, x1, c0);
00181 
00182       /* Read the b[numTaps-3] coefficients */
00183       c0 = *(pb++);
00184 
00185       /* Read x[n-numTaps-6] sample */
00186       x2 = *(px++);
00187 
00188       /* Perform the multiply-accumulates */      
00189       multAcc_32x32_keep32_R(acc0, x3, c0);
00190       multAcc_32x32_keep32_R(acc1, x0, c0);
00191       multAcc_32x32_keep32_R(acc2, x1, c0);
00192       multAcc_32x32_keep32_R(acc3, x2, c0);
00193       i--;
00194     }
00195 
00196     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
00197 
00198     i = numTaps - (tapCnt * 4u);
00199     while(i > 0u)
00200     {
00201       /* Read coefficients */
00202       c0 = *(pb++);
00203 
00204       /* Fetch 1 state variable */
00205       x3 = *(px++);
00206 
00207       /* Perform the multiply-accumulates */      
00208       multAcc_32x32_keep32_R(acc0, x0, c0);
00209       multAcc_32x32_keep32_R(acc1, x1, c0);
00210       multAcc_32x32_keep32_R(acc2, x2, c0);
00211       multAcc_32x32_keep32_R(acc3, x3, c0);
00212 
00213       /* Reuse the present sample states for next sample */
00214       x0 = x1;
00215       x1 = x2;
00216       x2 = x3;
00217 
00218       /* Decrement the loop counter */
00219       i--;
00220     }
00221 
00222     /* Advance the state pointer by 4 to process the next group of 4 samples */
00223     pState = pState + 4;
00224 
00225     /* The results in the 4 accumulators are in 2.30 format.  Convert to 1.31    
00226      ** Then store the 4 outputs in the destination buffer. */
00227     *pDst++ = (q31_t) (acc0 << 1);
00228     *pDst++ = (q31_t) (acc1 << 1);
00229     *pDst++ = (q31_t) (acc2 << 1);
00230     *pDst++ = (q31_t) (acc3 << 1);
00231 
00232     /* Decrement the samples loop counter */
00233     blkCnt--;
00234   }
00235 
00236 
00237   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.    
00238    ** No loop unrolling is used. */
00239   blkCnt = blockSize % 4u;
00240 
00241   while(blkCnt > 0u)
00242   {
00243     /* Copy one sample at a time into state buffer */
00244     *pStateCurnt++ = *pSrc++;
00245 
00246     /* Set the accumulator to zero */
00247     acc0 = 0;
00248 
00249     /* Initialize state pointer */
00250     px = pState;
00251 
00252     /* Initialize Coefficient pointer */
00253     pb = (pCoeffs);
00254 
00255     i = numTaps;
00256 
00257     /* Perform the multiply-accumulates */
00258     do
00259     {
00260       multAcc_32x32_keep32_R(acc0, (*px++), (*(pb++)));
00261       i--;
00262     } while(i > 0u);
00263 
00264     /* The result is in 2.30 format.  Convert to 1.31    
00265      ** Then store the output in the destination buffer. */
00266     *pDst++ = (q31_t) (acc0 << 1);
00267 
00268     /* Advance state pointer by 1 for the next sample */
00269     pState = pState + 1;
00270 
00271     /* Decrement the samples loop counter */
00272     blkCnt--;
00273   }
00274 
00275   /* Processing is complete.    
00276    ** Now copy the last numTaps - 1 samples to the start of the state buffer.    
00277    ** This prepares the state buffer for the next function call. */
00278 
00279   /* Points to the start of the state buffer */
00280   pStateCurnt = S->pState;
00281 
00282   /* Calculate remaining number of copies */
00283   tapCnt = (numTaps - 1u);
00284 
00285   /* Copy the remaining q31_t data */
00286   while(tapCnt > 0u)
00287   {
00288     *pStateCurnt++ = *pState++;
00289 
00290     /* Decrement the loop counter */
00291     tapCnt--;
00292   }
00293 
00294 
00295 }
00296 IAR_ONLY_LOW_OPTIMIZATION_EXIT
00297 /**    
00298  * @} end of FIR group    
00299  */