Robert Lopez / CMSIS5
Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_fir_fast_q31.c Source File

arm_fir_fast_q31.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_fir_fast_q31.c
00004  * Description:  Processing function for the Q31 Fast FIR filter
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupFilters
00033  */
00034 
00035 /**
00036  * @addtogroup FIR
00037  * @{
00038  */
00039 
00040 /**
00041  * @param[in] *S points to an instance of the Q31 structure.
00042  * @param[in] *pSrc points to the block of input data.
00043  * @param[out] *pDst points to the block output data.
00044  * @param[in] blockSize number of samples to process per call.
00045  * @return none.
00046  *
00047  * <b>Scaling and Overflow Behavior:</b>
00048  *
00049  * \par
00050  * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
00051  * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
00052  * These intermediate results are added to a 2.30 accumulator.
00053  * Finally, the accumulator is saturated and converted to a 1.31 result.
00054  * The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
00055  * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
00056  *
00057  * \par
00058  * Refer to the function <code>arm_fir_q31()</code> for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.  Both the slow and the fast versions use the same instance structure.
00059  * Use the function <code>arm_fir_init_q31()</code> to initialize the filter structure.
00060  */
00061 
00062 IAR_ONLY_LOW_OPTIMIZATION_ENTER
00063 void arm_fir_fast_q31(
00064   const arm_fir_instance_q31 * S,
00065   q31_t * pSrc,
00066   q31_t * pDst,
00067   uint32_t blockSize)
00068 {
00069   q31_t *pState = S->pState;                     /* State pointer */
00070   q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
00071   q31_t *pStateCurnt;                            /* Points to the current sample of the state */
00072   q31_t x0, x1, x2, x3;                          /* Temporary variables to hold state */
00073   q31_t c0;                                      /* Temporary variable to hold coefficient value */
00074   q31_t *px;                                     /* Temporary pointer for state */
00075   q31_t *pb;                                     /* Temporary pointer for coefficient buffer */
00076   q31_t acc0, acc1, acc2, acc3;                  /* Accumulators */
00077   uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
00078   uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
00079 
00080   /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
00081   /* pStateCurnt points to the location where the new input data should be written */
00082   pStateCurnt = &(S->pState[(numTaps - 1U)]);
00083 
00084   /* Apply loop unrolling and compute 4 output values simultaneously.
00085    * The variables acc0 ... acc3 hold output values that are being computed:
00086    *
00087    *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
00088    *    acc1 =  b[numTaps-1] * x[n-numTaps] +   b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
00089    *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] +   b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
00090    *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
00091    */
00092   blkCnt = blockSize >> 2;
00093 
00094   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
00095    ** a second loop below computes the remaining 1 to 3 samples. */
00096   while (blkCnt > 0U)
00097   {
00098     /* Copy four new input samples into the state buffer */
00099     *pStateCurnt++ = *pSrc++;
00100     *pStateCurnt++ = *pSrc++;
00101     *pStateCurnt++ = *pSrc++;
00102     *pStateCurnt++ = *pSrc++;
00103 
00104     /* Set all accumulators to zero */
00105     acc0 = 0;
00106     acc1 = 0;
00107     acc2 = 0;
00108     acc3 = 0;
00109 
00110     /* Initialize state pointer */
00111     px = pState;
00112 
00113     /* Initialize coefficient pointer */
00114     pb = pCoeffs;
00115 
00116     /* Read the first three samples from the state buffer:
00117      *  x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
00118     x0 = *(px++);
00119     x1 = *(px++);
00120     x2 = *(px++);
00121 
00122     /* Loop unrolling.  Process 4 taps at a time. */
00123     tapCnt = numTaps >> 2;
00124     i = tapCnt;
00125 
00126     while (i > 0U)
00127     {
00128       /* Read the b[numTaps] coefficient */
00129       c0 = *pb;
00130 
00131       /* Read x[n-numTaps-3] sample */
00132       x3 = *px;
00133 
00134       /* acc0 +=  b[numTaps] * x[n-numTaps] */
00135       multAcc_32x32_keep32_R(acc0, x0, c0);
00136 
00137       /* acc1 +=  b[numTaps] * x[n-numTaps-1] */
00138       multAcc_32x32_keep32_R(acc1, x1, c0);
00139 
00140       /* acc2 +=  b[numTaps] * x[n-numTaps-2] */
00141       multAcc_32x32_keep32_R(acc2, x2, c0);
00142 
00143       /* acc3 +=  b[numTaps] * x[n-numTaps-3] */
00144       multAcc_32x32_keep32_R(acc3, x3, c0);
00145 
00146       /* Read the b[numTaps-1] coefficient */
00147       c0 = *(pb + 1U);
00148 
00149       /* Read x[n-numTaps-4] sample */
00150       x0 = *(px + 1U);
00151 
00152       /* Perform the multiply-accumulates */
00153       multAcc_32x32_keep32_R(acc0, x1, c0);
00154       multAcc_32x32_keep32_R(acc1, x2, c0);
00155       multAcc_32x32_keep32_R(acc2, x3, c0);
00156       multAcc_32x32_keep32_R(acc3, x0, c0);
00157 
00158       /* Read the b[numTaps-2] coefficient */
00159       c0 = *(pb + 2U);
00160 
00161       /* Read x[n-numTaps-5] sample */
00162       x1 = *(px + 2U);
00163 
00164       /* Perform the multiply-accumulates */
00165       multAcc_32x32_keep32_R(acc0, x2, c0);
00166       multAcc_32x32_keep32_R(acc1, x3, c0);
00167       multAcc_32x32_keep32_R(acc2, x0, c0);
00168       multAcc_32x32_keep32_R(acc3, x1, c0);
00169 
00170       /* Read the b[numTaps-3] coefficients */
00171       c0 = *(pb + 3U);
00172 
00173       /* Read x[n-numTaps-6] sample */
00174       x2 = *(px + 3U);
00175 
00176       /* Perform the multiply-accumulates */
00177       multAcc_32x32_keep32_R(acc0, x3, c0);
00178       multAcc_32x32_keep32_R(acc1, x0, c0);
00179       multAcc_32x32_keep32_R(acc2, x1, c0);
00180       multAcc_32x32_keep32_R(acc3, x2, c0);
00181 
00182       /* update coefficient pointer */
00183       pb += 4U;
00184       px += 4U;
00185 
00186       /* Decrement the loop counter */
00187       i--;
00188     }
00189 
00190     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
00191 
00192     i = numTaps - (tapCnt * 4U);
00193     while (i > 0U)
00194     {
00195       /* Read coefficients */
00196       c0 = *(pb++);
00197 
00198       /* Fetch 1 state variable */
00199       x3 = *(px++);
00200 
00201       /* Perform the multiply-accumulates */
00202       multAcc_32x32_keep32_R(acc0, x0, c0);
00203       multAcc_32x32_keep32_R(acc1, x1, c0);
00204       multAcc_32x32_keep32_R(acc2, x2, c0);
00205       multAcc_32x32_keep32_R(acc3, x3, c0);
00206 
00207       /* Reuse the present sample states for next sample */
00208       x0 = x1;
00209       x1 = x2;
00210       x2 = x3;
00211 
00212       /* Decrement the loop counter */
00213       i--;
00214     }
00215 
00216     /* Advance the state pointer by 4 to process the next group of 4 samples */
00217     pState = pState + 4;
00218 
00219     /* The results in the 4 accumulators are in 2.30 format.  Convert to 1.31
00220      ** Then store the 4 outputs in the destination buffer. */
00221     *pDst++ = (q31_t) (acc0 << 1);
00222     *pDst++ = (q31_t) (acc1 << 1);
00223     *pDst++ = (q31_t) (acc2 << 1);
00224     *pDst++ = (q31_t) (acc3 << 1);
00225 
00226     /* Decrement the samples loop counter */
00227     blkCnt--;
00228   }
00229 
00230 
00231   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
00232    ** No loop unrolling is used. */
00233   blkCnt = blockSize % 4U;
00234 
00235   while (blkCnt > 0U)
00236   {
00237     /* Copy one sample at a time into state buffer */
00238     *pStateCurnt++ = *pSrc++;
00239 
00240     /* Set the accumulator to zero */
00241     acc0 = 0;
00242 
00243     /* Initialize state pointer */
00244     px = pState;
00245 
00246     /* Initialize Coefficient pointer */
00247     pb = (pCoeffs);
00248 
00249     i = numTaps;
00250 
00251     /* Perform the multiply-accumulates */
00252     do
00253     {
00254       multAcc_32x32_keep32_R(acc0, (*px++), (*(pb++)));
00255       i--;
00256     } while (i > 0U);
00257 
00258     /* The result is in 2.30 format.  Convert to 1.31
00259      ** Then store the output in the destination buffer. */
00260     *pDst++ = (q31_t) (acc0 << 1);
00261 
00262     /* Advance state pointer by 1 for the next sample */
00263     pState = pState + 1;
00264 
00265     /* Decrement the samples loop counter */
00266     blkCnt--;
00267   }
00268 
00269   /* Processing is complete.
00270    ** Now copy the last numTaps - 1 samples to the start of the state buffer.
00271    ** This prepares the state buffer for the next function call. */
00272 
00273   /* Points to the start of the state buffer */
00274   pStateCurnt = S->pState;
00275 
00276   /* Calculate remaining number of copies */
00277   tapCnt = (numTaps - 1U);
00278 
00279   /* Copy the remaining q31_t data */
00280   while (tapCnt > 0U)
00281   {
00282     *pStateCurnt++ = *pState++;
00283 
00284     /* Decrement the loop counter */
00285     tapCnt--;
00286   }
00287 
00288 
00289 }
00290 IAR_ONLY_LOW_OPTIMIZATION_EXIT
00291 /**
00292  * @} end of FIR group
00293  */
00294