CMSIS DSP library

Dependents:   performance_timer Surfboard_ gps2rtty Capstone ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_fir_decimate_fast_q31.c Source File

arm_fir_decimate_fast_q31.c

00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        19. March 2015
00005 * $Revision:    V.1.4.5
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_fir_decimate_fast_q31.c    
00009 *    
00010 * Description:  Fast Q31 FIR Decimator.    
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE. 
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**    
00044  * @ingroup groupFilters    
00045  */
00046 
00047 /**    
00048  * @addtogroup FIR_decimate    
00049  * @{    
00050  */
00051 
00052 /**    
00053  * @brief Processing function for the Q31 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4.    
00054  * @param[in] *S points to an instance of the Q31 FIR decimator structure.    
00055  * @param[in] *pSrc points to the block of input data.    
00056  * @param[out] *pDst points to the block of output data    
00057  * @param[in] blockSize number of input samples to process per call.    
00058  * @return none    
00059  *    
00060  * <b>Scaling and Overflow Behavior:</b>    
00061  *    
00062  * \par    
00063  * This function is optimized for speed at the expense of fixed-point precision and overflow protection.    
00064  * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.    
00065  * These intermediate results are added to a 2.30 accumulator.    
00066  * Finally, the accumulator is saturated and converted to a 1.31 result.    
00067  * The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.    
00068  * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (where log2 is read as log to the base 2).    
00069  *    
00070  * \par    
00071  * Refer to the function <code>arm_fir_decimate_q31()</code> for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.    
00072  * Both the slow and the fast versions use the same instance structure.    
00073  * Use the function <code>arm_fir_decimate_init_q31()</code> to initialize the filter structure.    
00074  */
00075 
00076 void arm_fir_decimate_fast_q31(
00077   arm_fir_decimate_instance_q31 * S,
00078   q31_t * pSrc,
00079   q31_t * pDst,
00080   uint32_t blockSize)
00081 {
00082   q31_t *pState = S->pState;                     /* State pointer */
00083   q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
00084   q31_t *pStateCurnt;                            /* Points to the current sample of the state */
00085   q31_t x0, c0;                                  /* Temporary variables to hold state and coefficient values */
00086   q31_t *px;                                     /* Temporary pointers for state buffer */
00087   q31_t *pb;                                     /* Temporary pointers for coefficient buffer */
00088   q31_t sum0;                                    /* Accumulator */
00089   uint32_t numTaps = S->numTaps;                 /* Number of taps */
00090   uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M;  /* Loop counters */
00091   uint32_t blkCntN2;
00092   q31_t x1;
00093   q31_t acc0, acc1;
00094   q31_t *px0, *px1;
00095 
00096   /* S->pState buffer contains previous frame (numTaps - 1) samples */
00097   /* pStateCurnt points to the location where the new input data should be written */
00098   pStateCurnt = S->pState + (numTaps - 1u);
00099 
00100   /* Total number of output samples to be computed */
00101 
00102   blkCnt = outBlockSize / 2;
00103   blkCntN2 = outBlockSize - (2 * blkCnt);
00104 
00105   while(blkCnt > 0u)
00106   {
00107     /* Copy decimation factor number of new input samples into the state buffer */
00108     i = 2 * S->M;
00109 
00110     do
00111     {
00112       *pStateCurnt++ = *pSrc++;
00113 
00114     } while(--i);
00115 
00116     /* Set accumulator to zero */
00117     acc0 = 0;
00118     acc1 = 0;
00119 
00120     /* Initialize state pointer */
00121     px0 = pState;
00122     px1 = pState + S->M;
00123 
00124     /* Initialize coeff pointer */
00125     pb = pCoeffs;
00126 
00127     /* Loop unrolling.  Process 4 taps at a time. */
00128     tapCnt = numTaps >> 2;
00129 
00130     /* Loop over the number of taps.  Unroll by a factor of 4.       
00131      ** Repeat until we've computed numTaps-4 coefficients. */
00132     while(tapCnt > 0u)
00133     {
00134       /* Read the b[numTaps-1] coefficient */
00135       c0 = *(pb);
00136 
00137       /* Read x[n-numTaps-1] for sample 0 sample 1 */
00138       x0 = *(px0);
00139       x1 = *(px1);
00140 
00141       /* Perform the multiply-accumulate */
00142       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
00143       acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
00144 
00145       /* Read the b[numTaps-2] coefficient */
00146       c0 = *(pb + 1u);
00147 
00148       /* Read x[n-numTaps-2]  for sample 0 sample 1  */
00149       x0 = *(px0 + 1u);
00150       x1 = *(px1 + 1u);
00151 
00152       /* Perform the multiply-accumulate */
00153       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
00154       acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
00155 
00156       /* Read the b[numTaps-3] coefficient */
00157       c0 = *(pb + 2u);
00158 
00159       /* Read x[n-numTaps-3]  for sample 0 sample 1 */
00160       x0 = *(px0 + 2u);
00161       x1 = *(px1 + 2u);
00162       pb += 4u;
00163 
00164       /* Perform the multiply-accumulate */
00165       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
00166       acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
00167 
00168       /* Read the b[numTaps-4] coefficient */
00169       c0 = *(pb - 1u);
00170 
00171       /* Read x[n-numTaps-4] for sample 0 sample 1 */
00172       x0 = *(px0 + 3u);
00173       x1 = *(px1 + 3u);
00174 
00175 
00176       /* Perform the multiply-accumulate */
00177       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
00178       acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
00179 
00180       /* update state pointers */
00181       px0 += 4u;
00182       px1 += 4u;
00183 
00184       /* Decrement the loop counter */
00185       tapCnt--;
00186     }
00187 
00188     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
00189     tapCnt = numTaps % 0x4u;
00190 
00191     while(tapCnt > 0u)
00192     {
00193       /* Read coefficients */
00194       c0 = *(pb++);
00195 
00196       /* Fetch 1 state variable */
00197       x0 = *(px0++);
00198       x1 = *(px1++);
00199 
00200       /* Perform the multiply-accumulate */
00201       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
00202       acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
00203 
00204       /* Decrement the loop counter */
00205       tapCnt--;
00206     }
00207 
00208     /* Advance the state pointer by the decimation factor       
00209      * to process the next group of decimation factor number samples */
00210     pState = pState + S->M * 2;
00211 
00212     /* The result is in the accumulator, store in the destination buffer. */
00213     *pDst++ = (q31_t) (acc0 << 1);
00214     *pDst++ = (q31_t) (acc1 << 1);
00215 
00216     /* Decrement the loop counter */
00217     blkCnt--;
00218   }
00219 
00220   while(blkCntN2 > 0u)
00221   {
00222     /* Copy decimation factor number of new input samples into the state buffer */
00223     i = S->M;
00224 
00225     do
00226     {
00227       *pStateCurnt++ = *pSrc++;
00228 
00229     } while(--i);
00230 
00231     /* Set accumulator to zero */
00232     sum0 = 0;
00233 
00234     /* Initialize state pointer */
00235     px = pState;
00236 
00237     /* Initialize coeff pointer */
00238     pb = pCoeffs;
00239 
00240     /* Loop unrolling.  Process 4 taps at a time. */
00241     tapCnt = numTaps >> 2;
00242 
00243     /* Loop over the number of taps.  Unroll by a factor of 4.       
00244      ** Repeat until we've computed numTaps-4 coefficients. */
00245     while(tapCnt > 0u)
00246     {
00247       /* Read the b[numTaps-1] coefficient */
00248       c0 = *(pb++);
00249 
00250       /* Read x[n-numTaps-1] sample */
00251       x0 = *(px++);
00252 
00253       /* Perform the multiply-accumulate */
00254       sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32);
00255 
00256       /* Read the b[numTaps-2] coefficient */
00257       c0 = *(pb++);
00258 
00259       /* Read x[n-numTaps-2] sample */
00260       x0 = *(px++);
00261 
00262       /* Perform the multiply-accumulate */
00263       sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32);
00264 
00265       /* Read the b[numTaps-3] coefficient */
00266       c0 = *(pb++);
00267 
00268       /* Read x[n-numTaps-3] sample */
00269       x0 = *(px++);
00270 
00271       /* Perform the multiply-accumulate */
00272       sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32);
00273 
00274       /* Read the b[numTaps-4] coefficient */
00275       c0 = *(pb++);
00276 
00277       /* Read x[n-numTaps-4] sample */
00278       x0 = *(px++);
00279 
00280       /* Perform the multiply-accumulate */
00281       sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32);
00282 
00283       /* Decrement the loop counter */
00284       tapCnt--;
00285     }
00286 
00287     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
00288     tapCnt = numTaps % 0x4u;
00289 
00290     while(tapCnt > 0u)
00291     {
00292       /* Read coefficients */
00293       c0 = *(pb++);
00294 
00295       /* Fetch 1 state variable */
00296       x0 = *(px++);
00297 
00298       /* Perform the multiply-accumulate */
00299       sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32);
00300 
00301       /* Decrement the loop counter */
00302       tapCnt--;
00303     }
00304 
00305     /* Advance the state pointer by the decimation factor       
00306      * to process the next group of decimation factor number samples */
00307     pState = pState + S->M;
00308 
00309     /* The result is in the accumulator, store in the destination buffer. */
00310     *pDst++ = (q31_t) (sum0 << 1);
00311 
00312     /* Decrement the loop counter */
00313     blkCntN2--;
00314   }
00315 
00316   /* Processing is complete.       
00317    ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.       
00318    ** This prepares the state buffer for the next function call. */
00319 
00320   /* Points to the start of the state buffer */
00321   pStateCurnt = S->pState;
00322 
00323   i = (numTaps - 1u) >> 2u;
00324 
00325   /* copy data */
00326   while(i > 0u)
00327   {
00328     *pStateCurnt++ = *pState++;
00329     *pStateCurnt++ = *pState++;
00330     *pStateCurnt++ = *pState++;
00331     *pStateCurnt++ = *pState++;
00332 
00333     /* Decrement the loop counter */
00334     i--;
00335   }
00336 
00337   i = (numTaps - 1u) % 0x04u;
00338 
00339   /* copy data */
00340   while(i > 0u)
00341   {
00342     *pStateCurnt++ = *pState++;
00343 
00344     /* Decrement the loop counter */
00345     i--;
00346   }
00347 }
00348 
00349 /**    
00350  * @} end of FIR_decimate group    
00351  */