CMSIS DSP library

Dependents:   performance_timer Surfboard_ gps2rtty Capstone ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_fir_fast_q31.c Source File

arm_fir_fast_q31.c

00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        19. March 2015 
00005 * $Revision:    V.1.4.5  
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_fir_fast_q31.c    
00009 *    
00010 * Description:  Processing function for the Q31 Fast FIR filter.    
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.    
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**    
00044  * @ingroup groupFilters    
00045  */
00046 
00047 /**    
00048  * @addtogroup FIR    
00049  * @{    
00050  */
00051 
00052 /**    
00053  * @param[in] *S points to an instance of the Q31 FIR filter structure.
00054  * @param[in] *pSrc points to the block of input data.
00055  * @param[out] *pDst points to the block of output data.
00056  * @param[in] blockSize number of samples to process per call.    
00057  * @return none.    
00058  *    
00059  * <b>Scaling and Overflow Behavior:</b>    
00060  *    
00061  * \par    
00062  * This function is optimized for speed at the expense of fixed-point precision and overflow protection.    
00063  * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.    
00064  * These intermediate results are added to a 2.30 accumulator.    
00065  * Finally, the accumulator is converted to a 1.31 result by a one-bit left shift; the function body applies no explicit saturation at this step.
00066  * The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.    
00067  * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.    
00068  *    
00069  * \par    
00070  * Refer to the function <code>arm_fir_q31()</code> for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.  Both the slow and the fast versions use the same instance structure.    
00071  * Use the function <code>arm_fir_init_q31()</code> to initialize the filter structure.    
00072  */
00073 
00074 IAR_ONLY_LOW_OPTIMIZATION_ENTER
00075 void arm_fir_fast_q31(                         /* Fast Q31 FIR: filters blockSize samples from pSrc into pDst */
00076   const arm_fir_instance_q31 * S,              /* Filter instance; supplies numTaps, pState and pCoeffs */
00077   q31_t * pSrc,                                /* Input block; blockSize samples are read from it */
00078   q31_t * pDst,                                /* Output block; blockSize samples are written to it */
00079   uint32_t blockSize)                          /* Number of samples to process in this call */
00080 {
00081   q31_t *pState = S->pState;                     /* Read position in the state buffer; advances as outputs are produced */
00082   q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
00083   q31_t *pStateCurnt;                            /* Write position in the state buffer for incoming samples */
00084   q31_t x0, x1, x2, x3;                          /* Sliding window of state samples shared by the 4 accumulators */
00085   q31_t c0;                                      /* Current coefficient value */
00086   q31_t *px;                                     /* Working pointer into the state buffer */
00087   q31_t *pb;                                     /* Working pointer into the coefficient buffer */
00088   q31_t acc0, acc1, acc2, acc3;                  /* Accumulators, one per output sample */
00089   uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
00090   uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
00091 
00092   /* The first (numTaps - 1) entries of S->pState are the samples carried over by the copy at the end of this function */
00093   /* New input is appended from index (numTaps - 1), so the buffer must hold numTaps + blockSize - 1 samples */
00094   pStateCurnt = &(S->pState[(numTaps - 1u)]);
00095 
00096   /* Apply loop unrolling and compute 4 output values simultaneously.
00097    * The variables acc0 ... acc3 hold output values that are being computed.
00098    * With pState denoting the state pointer at the start of the current group of 4 outputs:
00099    *    acc0 =  pCoeffs[0] * pState[0] + pCoeffs[1] * pState[1] + ... + pCoeffs[numTaps-1] * pState[numTaps-1]
00100    *    acc1 =  pCoeffs[0] * pState[1] + pCoeffs[1] * pState[2] + ... + pCoeffs[numTaps-1] * pState[numTaps]
00101    *    acc2 =  pCoeffs[0] * pState[2] + pCoeffs[1] * pState[3] + ... + pCoeffs[numTaps-1] * pState[numTaps+1]
00102    *    acc3 =  pCoeffs[0] * pState[3] + pCoeffs[1] * pState[4] + ... + pCoeffs[numTaps-1] * pState[numTaps+2]
00103    */
00104   blkCnt = blockSize >> 2;
00105 
00106   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
00107    ** A second loop below computes the remaining 0 to 3 samples. */
00108   while(blkCnt > 0u)
00109   {
00110     /* Copy four new input samples into the state buffer */
00111     *pStateCurnt++ = *pSrc++;
00112     *pStateCurnt++ = *pSrc++;
00113     *pStateCurnt++ = *pSrc++;
00114     *pStateCurnt++ = *pSrc++;
00115 
00116     /* Set all accumulators to zero */
00117     acc0 = 0;
00118     acc1 = 0;
00119     acc2 = 0;
00120     acc3 = 0;
00121 
00122     /* Start reading state samples at the current state pointer */
00123     px = pState;
00124 
00125     /* Start reading coefficients at the beginning of the coefficient buffer */
00126     pb = pCoeffs;
00127 
00128     /* Preload the first three state samples, pState[0], pState[1] and pState[2];
00129      * the fourth window entry (x3) is fetched inside the tap loops */
00130     x0 = *(px++);
00131     x1 = *(px++);
00132     x2 = *(px++);
00133 
00134     /* Loop unrolling.  Process 4 taps at a time. */
00135     tapCnt = numTaps >> 2;
00136     i = tapCnt;
00137 
00138     while(i > 0u)
00139     {
00140       /* Read coefficient pb[0] */
00141       c0 = *pb;
00142 
00143       /* Read state sample px[0] into x3 */
00144       x3 = *px;
00145 
00146       /* acc0 += x0 * c0 (accumulation done by multAcc_32x32_keep32_R, which is defined outside this file) */
00147       multAcc_32x32_keep32_R(acc0, x0, c0);
00148 
00149       /* acc1 += x1 * c0 */
00150       multAcc_32x32_keep32_R(acc1, x1, c0);
00151 
00152       /* acc2 += x2 * c0 */
00153       multAcc_32x32_keep32_R(acc2, x2, c0);
00154 
00155       /* acc3 += x3 * c0 */
00156       multAcc_32x32_keep32_R(acc3, x3, c0);
00157 
00158       /* Read coefficient pb[1] */
00159       c0 = *(pb + 1u);
00160 
00161       /* Read state sample px[1] into x0, whose previous value is no longer needed */
00162       x0 = *(px + 1u);
00163 
00164       /* Same four accumulations with the window rotated by one: x1, x2, x3, x0 */      
00165       multAcc_32x32_keep32_R(acc0, x1, c0);
00166       multAcc_32x32_keep32_R(acc1, x2, c0);
00167       multAcc_32x32_keep32_R(acc2, x3, c0);
00168       multAcc_32x32_keep32_R(acc3, x0, c0);
00169 
00170       /* Read coefficient pb[2] */
00171       c0 = *(pb + 2u);
00172 
00173       /* Read state sample px[2] into x1, whose previous value is no longer needed */
00174       x1 = *(px + 2u);
00175 
00176       /* Window rotated by two: x2, x3, x0, x1 */      
00177       multAcc_32x32_keep32_R(acc0, x2, c0);
00178       multAcc_32x32_keep32_R(acc1, x3, c0);
00179       multAcc_32x32_keep32_R(acc2, x0, c0);
00180       multAcc_32x32_keep32_R(acc3, x1, c0);
00181 
00182       /* Read coefficient pb[3] */
00183       c0 = *(pb + 3u);
00184 
00185       /* Read state sample px[3] into x2, whose previous value is no longer needed */
00186       x2 = *(px + 3u);
00187 
00188       /* Window rotated by three: x3, x0, x1, x2 */      
00189       multAcc_32x32_keep32_R(acc0, x3, c0);
00190       multAcc_32x32_keep32_R(acc1, x0, c0);
00191       multAcc_32x32_keep32_R(acc2, x1, c0);
00192       multAcc_32x32_keep32_R(acc3, x2, c0);
00193 
00194       /* Advance both the coefficient pointer and the state pointer by 4; x0..x2 are already loaded for the next pass */
00195       pb += 4u;
00196       px += 4u;
00197       
00198       /* Decrement the loop counter */
00199       i--;
00200     }
00201 
00202     /* If the filter length is not a multiple of 4, compute the remaining filter taps one at a time */
00203 
00204     i = numTaps - (tapCnt * 4u);
00205     while(i > 0u)
00206     {
00207       /* Read one coefficient and advance the coefficient pointer */
00208       c0 = *(pb++);
00209 
00210       /* Fetch one state sample into x3 and advance the state pointer */
00211       x3 = *(px++);
00212 
00213       /* One tap for each of the four outputs */      
00214       multAcc_32x32_keep32_R(acc0, x0, c0);
00215       multAcc_32x32_keep32_R(acc1, x1, c0);
00216       multAcc_32x32_keep32_R(acc2, x2, c0);
00217       multAcc_32x32_keep32_R(acc3, x3, c0);
00218 
00219       /* Slide the sample window by one position for the next tap */
00220       x0 = x1;
00221       x1 = x2;
00222       x2 = x3;
00223 
00224       /* Decrement the loop counter */
00225       i--;
00226     }
00227 
00228     /* Advance the state pointer by 4 to process the next group of 4 samples */
00229     pState = pState + 4;
00230 
00231     /* The results in the 4 accumulators are in 2.30 format.  Convert to 1.31 with a one-bit left shift
00232      ** (no saturation is applied here), then store the 4 outputs in the destination buffer. */
00233     *pDst++ = (q31_t) (acc0 << 1);               /* NOTE(review): if q31_t is signed, << on a negative value is undefined in ISO C - confirm toolchain behavior */
00234     *pDst++ = (q31_t) (acc1 << 1);
00235     *pDst++ = (q31_t) (acc2 << 1);
00236     *pDst++ = (q31_t) (acc3 << 1);
00237 
00238     /* Decrement the samples loop counter */
00239     blkCnt--;
00240   }
00241 
00242 
00243   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
00244    ** No loop unrolling is used. */
00245   blkCnt = blockSize % 4u;
00246 
00247   while(blkCnt > 0u)
00248   {
00249     /* Copy one sample at a time into state buffer */
00250     *pStateCurnt++ = *pSrc++;
00251 
00252     /* Set the accumulator to zero */
00253     acc0 = 0;
00254 
00255     /* Start reading state samples at the current state pointer */
00256     px = pState;
00257 
00258     /* Start reading coefficients at the beginning of the coefficient buffer */
00259     pb = (pCoeffs);
00260 
00261     i = numTaps;
00262 
00263     /* One multiply-accumulate per tap. NOTE(review): the do-while body runs at least once, so this assumes numTaps >= 1 */
00264     do
00265     {
00266       multAcc_32x32_keep32_R(acc0, (*px++), (*(pb++)));
00267       i--;
00268     } while(i > 0u);
00269 
00270     /* The result is in 2.30 format.  Convert to 1.31 with a one-bit left shift (no saturation is applied here),
00271      ** then store the output in the destination buffer. */
00272     *pDst++ = (q31_t) (acc0 << 1);
00273 
00274     /* Advance state pointer by 1 for the next sample */
00275     pState = pState + 1;
00276 
00277     /* Decrement the samples loop counter */
00278     blkCnt--;
00279   }
00280 
00281   /* Processing is complete.
00282    ** Now copy the last numTaps - 1 samples to the start of the state buffer.
00283    ** This prepares the state buffer for the next function call. */
00284 
00285   /* Destination of the copy: the start of the state buffer */
00286   pStateCurnt = S->pState;
00287 
00288   /* Number of samples to carry over */
00289   tapCnt = (numTaps - 1u);
00290 
00291   /* Copy one sample per iteration; pState has advanced by blockSize, so it now addresses the samples to keep */
00292   while(tapCnt > 0u)
00293   {
00294     *pStateCurnt++ = *pState++;
00295 
00296     /* Decrement the loop counter */
00297     tapCnt--;
00298   }
00299 
00300 
00301 }
00302 IAR_ONLY_LOW_OPTIMIZATION_EXIT
00303 /**    
00304  * @} end of FIR group    
00305  */