Added CMSIS5 DSP and NN folder. Needs some work

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_fir_f32.c Source File

arm_fir_f32.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_fir_f32.c
00004  * Description:  Floating-point FIR filter processing function
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032 * @ingroup groupFilters
00033 */
00034 
00035 /**
00036 * @defgroup FIR Finite Impulse Response (FIR) Filters
00037 *
00038 * This set of functions implements Finite Impulse Response (FIR) filters
00039 * for Q7, Q15, Q31, and floating-point data types.  Fast versions of Q15 and Q31 are also provided.
00040 * The functions operate on blocks of input and output data and each call to the function processes
00041 * <code>blockSize</code> samples through the filter.  <code>pSrc</code> and
00042 * <code>pDst</code> point to input and output arrays containing <code>blockSize</code> values.
00043 *
00044 * \par Algorithm:
00045 * The FIR filter algorithm is based upon a sequence of multiply-accumulate (MAC) operations.
00046 * Each filter coefficient <code>b[n]</code> is multiplied by a state variable which equals a previous input sample <code>x[n]</code>.
00047 * <pre>
00048 *    y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
00049 * </pre>
00050 * \par
00051 * \image html FIR.gif "Finite Impulse Response filter"
00052 * \par
00053 * <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>.
00054 * Coefficients are stored in time reversed order.
00055 * \par
00056 * <pre>
00057 *    {b[numTaps-1], b[numTaps-2], b[numTaps-3], ..., b[1], b[0]}
00058 * </pre>
00059 * \par
00060 * <code>pState</code> points to a state array of size <code>numTaps + blockSize - 1</code>.
00061 * Samples in the state buffer are stored in the following order.
00062 * \par
00063 * <pre>
00064 *    {x[n-numTaps+1], x[n-numTaps+2], ..., x[n-1], x[n], x[n+1], ..., x[n+blockSize-1]}
00065 * </pre>
00066 * \par
00067 * Note that the length of the state buffer exceeds the length of the coefficient array by <code>blockSize-1</code>.
00068 * The increased state buffer length allows circular addressing, which is traditionally used in the FIR filters,
00069 * to be avoided and yields a significant speed improvement.
00070 * The state variables are updated after each block of data is processed; the coefficients are untouched.
00071 * \par Instance Structure
00072 * The coefficients and state variables for a filter are stored together in an instance data structure.
00073 * A separate instance structure must be defined for each filter.
00074 * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
00075 * There are separate instance structure declarations for each of the 4 supported data types.
00076 *
00077 * \par Initialization Functions
00078 * There is also an associated initialization function for each data type.
00079 * The initialization function performs the following operations:
00080 * - Sets the values of the internal structure fields.
00081 * - Zeros out the values in the state buffer.
00082 * To do this manually without calling the init function, assign the following subfields of the instance structure:
00083 * numTaps, pCoeffs, pState. Also set all of the values in pState to zero.
00084 *
00085 * \par
00086 * Use of the initialization function is optional.
00087 * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
00088 * To place an instance structure into a const data section, the instance structure must be manually initialized.
00089 * Set the values in the state buffer to zeros before static initialization.
00090 * The code below statically initializes each of the 4 different data type filter instance structures
00091 * <pre>
00092 *arm_fir_instance_f32 S = {numTaps, pState, pCoeffs};
00093 *arm_fir_instance_q31 S = {numTaps, pState, pCoeffs};
00094 *arm_fir_instance_q15 S = {numTaps, pState, pCoeffs};
00095 *arm_fir_instance_q7 S =  {numTaps, pState, pCoeffs};
00096 * </pre>
00097 *
00098 * where <code>numTaps</code> is the number of filter coefficients in the filter; <code>pState</code> is the address of the state buffer;
00099 * <code>pCoeffs</code> is the address of the coefficient buffer.
00100 *
00101 * \par Fixed-Point Behavior
00102 * Care must be taken when using the fixed-point versions of the FIR filter functions.
00103 * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
00104 * Refer to the function specific documentation below for usage guidelines.
00105 */
00106 
00107 /**
00108 * @addtogroup FIR
00109 * @{
00110 */
00111 
00112 /**
00113 * @brief Processing function for the floating-point FIR filter.
00114 * @param[in]  *S points to an instance of the floating-point FIR filter structure.
00115 * @param[in]  *pSrc points to the block of input data.
00116 * @param[out] *pDst points to the block of output data.
00117 * @param[in]  blockSize number of samples to process per call.
00118 * @return     none.
00119 *
00120 */
00121 
00122 #if defined(ARM_MATH_CM7)
00123 
00124 void arm_fir_f32(
00125 const arm_fir_instance_f32 * S,
00126 float32_t * pSrc,
00127 float32_t * pDst,
00128 uint32_t blockSize)
00129 {
00130    float32_t *pState = S->pState;                 /* State pointer */
00131    float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
00132    float32_t *pStateCurnt;                        /* Points to the current sample of the state */
00133    float32_t *px, *pb;                            /* Temporary pointers for state and coefficient buffers */
00134    float32_t acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;     /* Accumulators */
00135    float32_t x0, x1, x2, x3, x4, x5, x6, x7, c0;  /* Temporary variables to hold state and coefficient values */
00136    uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
00137    uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
00138    /* ARM_MATH_CM7 variant: every product is accumulated directly into acc0 ... acc7 */
00139    /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
00140    /* pStateCurnt points to the location where the new input data should be written */
00141    pStateCurnt = &(S->pState[(numTaps - 1U)]);
00142 
00143    /* Apply loop unrolling and compute 8 output values simultaneously.
00144     * The variables acc0 ... acc7 hold output values that are being computed
00145     * (x[n] is the first of the 8 new samples; acc4 ... acc7 continue the same pattern):
00146     *    acc0 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps+2] + b[numTaps-3] * x[n-numTaps+3] +...+ b[0] * x[n]
00147     *    acc1 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+3] + b[numTaps-3] * x[n-numTaps+4] +...+ b[0] * x[n+1]
00148     *    acc2 =  b[numTaps-1] * x[n-numTaps+3] + b[numTaps-2] * x[n-numTaps+4] + b[numTaps-3] * x[n-numTaps+5] +...+ b[0] * x[n+2]
00149     *    acc3 =  b[numTaps-1] * x[n-numTaps+4] + b[numTaps-2] * x[n-numTaps+5] + b[numTaps-3] * x[n-numTaps+6] +...+ b[0] * x[n+3]
00150     */
00151    blkCnt = blockSize >> 3;
00152 
00153    /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.
00154    ** A second loop below computes the remaining 1 to 7 samples. */
00155    while (blkCnt > 0U)
00156    {
00157       /* Copy the first four of the eight new input samples into the state buffer */
00158       *pStateCurnt++ = *pSrc++;
00159       *pStateCurnt++ = *pSrc++;
00160       *pStateCurnt++ = *pSrc++;
00161       *pStateCurnt++ = *pSrc++;
00162 
00163       /* Set all accumulators to zero */
00164       acc0 = 0.0f;
00165       acc1 = 0.0f;
00166       acc2 = 0.0f;
00167       acc3 = 0.0f;
00168       acc4 = 0.0f;
00169       acc5 = 0.0f;
00170       acc6 = 0.0f;
00171       acc7 = 0.0f;
00172 
00173       /* Initialize state pointer */
00174       px = pState;
00175 
00176       /* Initialize coeff pointer */
00177       pb = (pCoeffs);
00178 
00179       /* Copy the other four new input samples. This is separated from the
00180        * first four to avoid a call to __aeabi_memmove which would be slower
00181        */
00182       *pStateCurnt++ = *pSrc++;
00183       *pStateCurnt++ = *pSrc++;
00184       *pStateCurnt++ = *pSrc++;
00185       *pStateCurnt++ = *pSrc++;
00186 
00187       /* Read the first seven samples of the window from the state buffer into x0 ... x6 */
00188       x0 = *px++;
00189       x1 = *px++;
00190       x2 = *px++;
00191       x3 = *px++;
00192       x4 = *px++;
00193       x5 = *px++;
00194       x6 = *px++;
00195 
00196       /* Loop unrolling.  Process 8 taps at a time. */
00197       tapCnt = numTaps >> 3U;
00198 
00199       /* Loop over the number of taps.  Unroll by a factor of 8.
00200        ** Runs numTaps / 8 times; leftover taps are handled by the loop that follows. */
00201       while (tapCnt > 0U)
00202       {
00203          /* Read the first coefficient of this group of eight */
00204          c0 = *(pb++);
00205 
00206          /* Read the eighth sample of the current window into x7 */
00207          x7 = *(px++);
00208 
00209          /* acc0 += c0 * x0 */
00210          acc0 += x0 * c0;
00211 
00212          /* acc1 += c0 * x1 */
00213          acc1 += x1 * c0;
00214 
00215          /* acc2 += c0 * x2 */
00216          acc2 += x2 * c0;
00217 
00218          /* acc3 += c0 * x3 */
00219          acc3 += x3 * c0;
00220 
00221          /* acc4 += c0 * x4 */
00222          acc4 += x4 * c0;
00223 
00224          /* acc5 += c0 * x5 */
00225          acc5 += x5 * c0;
00226 
00227          /* acc6 += c0 * x6 */
00228          acc6 += x6 * c0;
00229 
00230          /* acc7 += c0 * x7 */
00231          acc7 += x7 * c0;
00232 
00233          /* Read the second coefficient of this group */
00234          c0 = *(pb++);
00235 
00236          /* Read the next state sample; x0 was consumed above, so it is reused for the newest sample */
00237          x0 = *(px++);
00238 
00239          /* Perform the multiply-accumulates with the sample window shifted by one */
00240          acc0 += x1 * c0;
00241          acc1 += x2 * c0;
00242          acc2 += x3 * c0;
00243          acc3 += x4 * c0;
00244          acc4 += x5 * c0;
00245          acc5 += x6 * c0;
00246          acc6 += x7 * c0;
00247          acc7 += x0 * c0;
00248 
00249          /* Read the third coefficient of this group */
00250          c0 = *(pb++);
00251 
00252          /* Read the next state sample into x1 */
00253          x1 = *(px++);
00254 
00255          /* Perform the multiply-accumulates */
00256          acc0 += x2 * c0;
00257          acc1 += x3 * c0;
00258          acc2 += x4 * c0;
00259          acc3 += x5 * c0;
00260          acc4 += x6 * c0;
00261          acc5 += x7 * c0;
00262          acc6 += x0 * c0;
00263          acc7 += x1 * c0;
00264 
00265          /* Read the fourth coefficient of this group */
00266          c0 = *(pb++);
00267 
00268          /* Read the next state sample into x2 */
00269          x2 = *(px++);
00270 
00271          /* Perform the multiply-accumulates */
00272          acc0 += x3 * c0;
00273          acc1 += x4 * c0;
00274          acc2 += x5 * c0;
00275          acc3 += x6 * c0;
00276          acc4 += x7 * c0;
00277          acc5 += x0 * c0;
00278          acc6 += x1 * c0;
00279          acc7 += x2 * c0;
00280 
00281          /* Read the fifth coefficient of this group */
00282          c0 = *(pb++);
00283 
00284          /* Read the next state sample into x3 */
00285          x3 = *(px++);
00286          /* Perform the multiply-accumulates */
00287          acc0 += x4 * c0;
00288          acc1 += x5 * c0;
00289          acc2 += x6 * c0;
00290          acc3 += x7 * c0;
00291          acc4 += x0 * c0;
00292          acc5 += x1 * c0;
00293          acc6 += x2 * c0;
00294          acc7 += x3 * c0;
00295 
00296          /* Read the sixth coefficient of this group */
00297          c0 = *(pb++);
00298 
00299          /* Read the next state sample into x4 */
00300          x4 = *(px++);
00301 
00302          /* Perform the multiply-accumulates */
00303          acc0 += x5 * c0;
00304          acc1 += x6 * c0;
00305          acc2 += x7 * c0;
00306          acc3 += x0 * c0;
00307          acc4 += x1 * c0;
00308          acc5 += x2 * c0;
00309          acc6 += x3 * c0;
00310          acc7 += x4 * c0;
00311 
00312          /* Read the seventh coefficient of this group */
00313          c0 = *(pb++);
00314 
00315          /* Read the next state sample into x5 */
00316          x5 = *(px++);
00317 
00318          /* Perform the multiply-accumulates */
00319          acc0 += x6 * c0;
00320          acc1 += x7 * c0;
00321          acc2 += x0 * c0;
00322          acc3 += x1 * c0;
00323          acc4 += x2 * c0;
00324          acc5 += x3 * c0;
00325          acc6 += x4 * c0;
00326          acc7 += x5 * c0;
00327 
00328          /* Read the eighth coefficient of this group */
00329          c0 = *(pb++);
00330 
00331          /* Read the next state sample into x6; x0 ... x6 are then ready for the next iteration */
00332          x6 = *(px++);
00333 
00334          /* Perform the multiply-accumulates */
00335          acc0 += x7 * c0;
00336          acc1 += x0 * c0;
00337          acc2 += x1 * c0;
00338          acc3 += x2 * c0;
00339          acc4 += x3 * c0;
00340          acc5 += x4 * c0;
00341          acc6 += x5 * c0;
00342          acc7 += x6 * c0;
00343 
00344          tapCnt--;
00345       }
00346 
00347       /* If the filter length is not a multiple of 8, compute the remaining filter taps */
00348       tapCnt = numTaps % 0x8U;
00349 
00350       while (tapCnt > 0U)
00351       {
00352          /* Read the next coefficient */
00353          c0 = *(pb++);
00354 
00355          /* Fetch 1 state variable */
00356          x7 = *(px++);
00357 
00358          /* Perform the multiply-accumulates */
00359          acc0 += x0 * c0;
00360          acc1 += x1 * c0;
00361          acc2 += x2 * c0;
00362          acc3 += x3 * c0;
00363          acc4 += x4 * c0;
00364          acc5 += x5 * c0;
00365          acc6 += x6 * c0;
00366          acc7 += x7 * c0;
00367 
00368          /* Shift the sample window by one so the next tap reuses the values already loaded */
00369          x0 = x1;
00370          x1 = x2;
00371          x2 = x3;
00372          x3 = x4;
00373          x4 = x5;
00374          x5 = x6;
00375          x6 = x7;
00376 
00377          /* Decrement the loop counter */
00378          tapCnt--;
00379       }
00380 
00381       /* Advance the state pointer by 8 to process the next group of 8 samples */
00382       pState = pState + 8;
00383 
00384       /* Store the results held in the 8 accumulators in the destination buffer. */
00385       *pDst++ = acc0;
00386       *pDst++ = acc1;
00387       *pDst++ = acc2;
00388       *pDst++ = acc3;
00389       *pDst++ = acc4;
00390       *pDst++ = acc5;
00391       *pDst++ = acc6;
00392       *pDst++ = acc7;
00393 
00394       blkCnt--;
00395    }
00396 
00397    /* If the blockSize is not a multiple of 8, compute any remaining output samples here.
00398    ** No loop unrolling is used. */
00399    blkCnt = blockSize % 0x8U;
00400 
00401    while (blkCnt > 0U)
00402    {
00403       /* Copy one sample at a time into state buffer */
00404       *pStateCurnt++ = *pSrc++;
00405 
00406       /* Set the accumulator to zero */
00407       acc0 = 0.0f;
00408 
00409       /* Initialize state pointer */
00410       px = pState;
00411 
00412       /* Initialize Coefficient pointer */
00413       pb = (pCoeffs);
00414 
00415       i = numTaps;   /* NOTE(review): the do/while below always runs once, so numTaps is assumed >= 1 */
00416 
00417       /* Perform the multiply-accumulates */
00418       do
00419       {
00420          acc0 += *px++ * *pb++;
00421          i--;
00422 
00423       } while (i > 0U);
00424 
00425       /* Store the result in the destination buffer. */
00426       *pDst++ = acc0;
00427 
00428       /* Advance state pointer by 1 for the next sample */
00429       pState = pState + 1;
00430 
00431       blkCnt--;
00432    }
00433 
00434    /* Processing is complete.
00435    ** Now copy the last numTaps - 1 samples to the start of the state buffer.
00436    ** This prepares the state buffer for the next function call. */
00437 
00438    /* Points to the start of the state buffer */
00439    pStateCurnt = S->pState;
00440 
00441    tapCnt = (numTaps - 1U) >> 2U;
00442 
00443    /* Copy data, four samples per iteration */
00444    while (tapCnt > 0U)
00445    {
00446       *pStateCurnt++ = *pState++;
00447       *pStateCurnt++ = *pState++;
00448       *pStateCurnt++ = *pState++;
00449       *pStateCurnt++ = *pState++;
00450 
00451       /* Decrement the loop counter */
00452       tapCnt--;
00453    }
00454 
00455    /* Calculate remaining number of copies */
00456    tapCnt = (numTaps - 1U) % 0x4U;
00457 
00458    /* Copy the remaining float32_t samples */
00459    while (tapCnt > 0U)
00460    {
00461       *pStateCurnt++ = *pState++;
00462 
00463       /* Decrement the loop counter */
00464       tapCnt--;
00465    }
00466 }
00467 
00468 #elif defined(ARM_MATH_CM0_FAMILY)
00469 
00470 void arm_fir_f32(
00471 const arm_fir_instance_f32 * S,
00472 float32_t * pSrc,
00473 float32_t * pDst,
00474 uint32_t blockSize)
00475 {
00476    float32_t *pState = S->pState;                 /* State pointer */
00477    float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
00478    float32_t *pStateCurnt;                        /* Points to the current sample of the state */
00479    float32_t *px, *pb;                            /* Temporary pointers for state and coefficient buffers */
00480    uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
00481    uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
00482 
00483    /* ARM_MATH_CM0_FAMILY variant: one output sample per iteration, no loop unrolling */
00484 
00485    float32_t acc;                                 /* Accumulator for the current output sample */
00486 
00487    /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
00488    /* pStateCurnt points to the location where the new input data should be written */
00489    pStateCurnt = &(S->pState[(numTaps - 1U)]);
00490 
00491    /* Initialize blkCnt with blockSize */
00492    blkCnt = blockSize;
00493 
00494    while (blkCnt > 0U)
00495    {
00496       /* Copy one sample at a time into state buffer */
00497       *pStateCurnt++ = *pSrc++;
00498 
00499       /* Set the accumulator to zero */
00500       acc = 0.0f;
00501 
00502       /* Initialize state pointer */
00503       px = pState;
00504 
00505       /* Initialize Coefficient pointer */
00506       pb = pCoeffs;
00507 
00508       i = numTaps;   /* NOTE(review): the do/while below always runs once, so numTaps is assumed >= 1 */
00509 
00510       /* Perform the multiply-accumulates */
00511       do
00512       {
00513          /* acc =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps+2] + b[numTaps-3] * x[n-numTaps+3] +...+ b[0] * x[n] */
00514          acc += *px++ * *pb++;
00515          i--;
00516 
00517       } while (i > 0U);
00518 
00519       /* Store the result in the destination buffer. */
00520       *pDst++ = acc;
00521 
00522       /* Advance state pointer by 1 for the next sample */
00523       pState = pState + 1;
00524 
00525       blkCnt--;
00526    }
00527 
00528    /* Processing is complete.
00529    ** Now copy the last numTaps - 1 samples to the start of the state buffer.
00530    ** This prepares the state buffer for the next function call. */
00531 
00532    /* Points to the start of the state buffer */
00533    pStateCurnt = S->pState;
00534 
00535    /* Copy numTaps - 1 values */
00536    tapCnt = numTaps - 1U;
00537 
00538    /* Copy data */
00539    while (tapCnt > 0U)
00540    {
00541       *pStateCurnt++ = *pState++;
00542 
00543       /* Decrement the loop counter */
00544       tapCnt--;
00545    }
00546 
00547 }
00548 
00549 #else
00550 
00551 /* Run the below code for Cortex-M4 and Cortex-M3 */
00552 
00553 void arm_fir_f32(
00554 const arm_fir_instance_f32 * S,
00555 float32_t * pSrc,
00556 float32_t * pDst,
00557 uint32_t blockSize)
00558 {
00559    float32_t *pState = S->pState;                 /* State pointer */
00560    float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
00561    float32_t *pStateCurnt;                        /* Points to the current sample of the state */
00562    float32_t *px, *pb;                            /* Temporary pointers for state and coefficient buffers */
00563    float32_t acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;     /* Accumulators */
00564    float32_t x0, x1, x2, x3, x4, x5, x6, x7, c0;  /* Temporary variables to hold state and coefficient values */
00565    uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
00566    uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
00567    float32_t p0,p1,p2,p3,p4,p5,p6,p7;             /* Temporary product values */
00568    /* Default variant: products are first stored in p0 ... p7 and added to acc0 ... acc7 after the next loads */
00569    /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
00570    /* pStateCurnt points to the location where the new input data should be written */
00571    pStateCurnt = &(S->pState[(numTaps - 1U)]);
00572 
00573    /* Apply loop unrolling and compute 8 output values simultaneously.
00574     * The variables acc0 ... acc7 hold output values that are being computed
00575     * (x[n] is the first of the 8 new samples; acc4 ... acc7 continue the same pattern):
00576     *    acc0 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps+2] + b[numTaps-3] * x[n-numTaps+3] +...+ b[0] * x[n]
00577     *    acc1 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+3] + b[numTaps-3] * x[n-numTaps+4] +...+ b[0] * x[n+1]
00578     *    acc2 =  b[numTaps-1] * x[n-numTaps+3] + b[numTaps-2] * x[n-numTaps+4] + b[numTaps-3] * x[n-numTaps+5] +...+ b[0] * x[n+2]
00579     *    acc3 =  b[numTaps-1] * x[n-numTaps+4] + b[numTaps-2] * x[n-numTaps+5] + b[numTaps-3] * x[n-numTaps+6] +...+ b[0] * x[n+3]
00580     */
00581    blkCnt = blockSize >> 3;
00582 
00583    /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.
00584    ** A second loop below computes the remaining 1 to 7 samples. */
00585    while (blkCnt > 0U)
00586    {
00587       /* Copy the first four of the eight new input samples into the state buffer */
00588       *pStateCurnt++ = *pSrc++;
00589       *pStateCurnt++ = *pSrc++;
00590       *pStateCurnt++ = *pSrc++;
00591       *pStateCurnt++ = *pSrc++;
00592 
00593       /* Set all accumulators to zero */
00594       acc0 = 0.0f;
00595       acc1 = 0.0f;
00596       acc2 = 0.0f;
00597       acc3 = 0.0f;
00598       acc4 = 0.0f;
00599       acc5 = 0.0f;
00600       acc6 = 0.0f;
00601       acc7 = 0.0f;
00602 
00603       /* Initialize state pointer */
00604       px = pState;
00605 
00606       /* Initialize coeff pointer */
00607       pb = (pCoeffs);
00608 
00609       /* Copy the other four new input samples. This is separated from the
00610        * first four to avoid a call to __aeabi_memmove which would be slower
00611        */
00612       *pStateCurnt++ = *pSrc++;
00613       *pStateCurnt++ = *pSrc++;
00614       *pStateCurnt++ = *pSrc++;
00615       *pStateCurnt++ = *pSrc++;
00616 
00617       /* Read the first seven samples of the window from the state buffer into x0 ... x6 */
00618       x0 = *px++;
00619       x1 = *px++;
00620       x2 = *px++;
00621       x3 = *px++;
00622       x4 = *px++;
00623       x5 = *px++;
00624       x6 = *px++;
00625 
00626       /* Loop unrolling.  Process 8 taps at a time. */
00627       tapCnt = numTaps >> 3U;
00628 
00629       /* Loop over the number of taps.  Unroll by a factor of 8.
00630        ** Runs numTaps / 8 times; leftover taps are handled by the loop that follows. */
00631       while (tapCnt > 0U)
00632       {
00633          /* Read the first coefficient of this group of eight */
00634          c0 = *(pb++);
00635 
00636          /* Read the eighth sample of the current window into x7 */
00637          x7 = *(px++);
00638 
00639          /* p0 = c0 * x0 (added to acc0 after the next loads) */
00640          p0 = x0 * c0;
00641 
00642          /* p1 = c0 * x1 (added to acc1) */
00643          p1 = x1 * c0;
00644 
00645          /* p2 = c0 * x2 (added to acc2) */
00646          p2 = x2 * c0;
00647 
00648          /* p3 = c0 * x3 (added to acc3) */
00649          p3 = x3 * c0;
00650 
00651          /* p4 = c0 * x4 (added to acc4) */
00652          p4 = x4 * c0;
00653 
00654          /* p5 = c0 * x5 (added to acc5) */
00655          p5 = x5 * c0;
00656 
00657          /* p6 = c0 * x6 (added to acc6) */
00658          p6 = x6 * c0;
00659 
00660          /* p7 = c0 * x7 (added to acc7) */
00661          p7 = x7 * c0;
00662 
00663          /* Read the second coefficient of this group */
00664          c0 = *(pb++);
00665 
00666          /* Read the next state sample; x0 was consumed above, so it is reused for the newest sample */
00667          x0 = *(px++);
00668 
00669          acc0 += p0;
00670          acc1 += p1;
00671          acc2 += p2;
00672          acc3 += p3;
00673          acc4 += p4;
00674          acc5 += p5;
00675          acc6 += p6;
00676          acc7 += p7;
00677 
00678 
00679          /* Compute the products with the sample window shifted by one */
00680          p0 = x1 * c0;
00681          p1 = x2 * c0;
00682          p2 = x3 * c0;
00683          p3 = x4 * c0;
00684          p4 = x5 * c0;
00685          p5 = x6 * c0;
00686          p6 = x7 * c0;
00687          p7 = x0 * c0;
00688 
00689          /* Read the third coefficient of this group */
00690          c0 = *(pb++);
00691 
00692          /* Read the next state sample into x1 */
00693          x1 = *(px++);
00694 
00695          acc0 += p0;
00696          acc1 += p1;
00697          acc2 += p2;
00698          acc3 += p3;
00699          acc4 += p4;
00700          acc5 += p5;
00701          acc6 += p6;
00702          acc7 += p7;
00703 
00704          /* Compute the products for the third coefficient */
00705          p0 = x2 * c0;
00706          p1 = x3 * c0;
00707          p2 = x4 * c0;
00708          p3 = x5 * c0;
00709          p4 = x6 * c0;
00710          p5 = x7 * c0;
00711          p6 = x0 * c0;
00712          p7 = x1 * c0;
00713 
00714          /* Read the fourth coefficient of this group */
00715          c0 = *(pb++);
00716 
00717          /* Read the next state sample into x2 */
00718          x2 = *(px++);
00719 
00720          acc0 += p0;
00721          acc1 += p1;
00722          acc2 += p2;
00723          acc3 += p3;
00724          acc4 += p4;
00725          acc5 += p5;
00726          acc6 += p6;
00727          acc7 += p7;
00728 
00729          /* Compute the products for the fourth coefficient */
00730          p0 = x3 * c0;
00731          p1 = x4 * c0;
00732          p2 = x5 * c0;
00733          p3 = x6 * c0;
00734          p4 = x7 * c0;
00735          p5 = x0 * c0;
00736          p6 = x1 * c0;
00737          p7 = x2 * c0;
00738 
00739          /* Read the fifth coefficient of this group */
00740          c0 = *(pb++);
00741 
00742          /* Read the next state sample into x3 */
00743          x3 = *(px++);
00744 
00745          acc0 += p0;
00746          acc1 += p1;
00747          acc2 += p2;
00748          acc3 += p3;
00749          acc4 += p4;
00750          acc5 += p5;
00751          acc6 += p6;
00752          acc7 += p7;
00753 
00754          /* Compute the products for the fifth coefficient */
00755          p0 = x4 * c0;
00756          p1 = x5 * c0;
00757          p2 = x6 * c0;
00758          p3 = x7 * c0;
00759          p4 = x0 * c0;
00760          p5 = x1 * c0;
00761          p6 = x2 * c0;
00762          p7 = x3 * c0;
00763 
00764          /* Read the sixth coefficient of this group */
00765          c0 = *(pb++);
00766 
00767          /* Read the next state sample into x4 */
00768          x4 = *(px++);
00769 
00770          acc0 += p0;
00771          acc1 += p1;
00772          acc2 += p2;
00773          acc3 += p3;
00774          acc4 += p4;
00775          acc5 += p5;
00776          acc6 += p6;
00777          acc7 += p7;
00778 
00779          /* Compute the products for the sixth coefficient */
00780          p0 = x5 * c0;
00781          p1 = x6 * c0;
00782          p2 = x7 * c0;
00783          p3 = x0 * c0;
00784          p4 = x1 * c0;
00785          p5 = x2 * c0;
00786          p6 = x3 * c0;
00787          p7 = x4 * c0;
00788 
00789          /* Read the seventh coefficient of this group */
00790          c0 = *(pb++);
00791 
00792          /* Read the next state sample into x5 */
00793          x5 = *(px++);
00794 
00795          acc0 += p0;
00796          acc1 += p1;
00797          acc2 += p2;
00798          acc3 += p3;
00799          acc4 += p4;
00800          acc5 += p5;
00801          acc6 += p6;
00802          acc7 += p7;
00803 
00804          /* Compute the products for the seventh coefficient */
00805          p0 = x6 * c0;
00806          p1 = x7 * c0;
00807          p2 = x0 * c0;
00808          p3 = x1 * c0;
00809          p4 = x2 * c0;
00810          p5 = x3 * c0;
00811          p6 = x4 * c0;
00812          p7 = x5 * c0;
00813 
00814          /* Read the eighth coefficient of this group */
00815          c0 = *(pb++);
00816 
00817          /* Read the next state sample into x6; x0 ... x6 are then ready for the next iteration */
00818          x6 = *(px++);
00819 
00820          acc0 += p0;
00821          acc1 += p1;
00822          acc2 += p2;
00823          acc3 += p3;
00824          acc4 += p4;
00825          acc5 += p5;
00826          acc6 += p6;
00827          acc7 += p7;
00828 
00829          /* Compute the products for the eighth coefficient */
00830          p0 = x7 * c0;
00831          p1 = x0 * c0;
00832          p2 = x1 * c0;
00833          p3 = x2 * c0;
00834          p4 = x3 * c0;
00835          p5 = x4 * c0;
00836          p6 = x5 * c0;
00837          p7 = x6 * c0;
00838 
00839          tapCnt--;
00840 
00841          acc0 += p0;
00842          acc1 += p1;
00843          acc2 += p2;
00844          acc3 += p3;
00845          acc4 += p4;
00846          acc5 += p5;
00847          acc6 += p6;
00848          acc7 += p7;
00849       }
00850 
00851       /* If the filter length is not a multiple of 8, compute the remaining filter taps */
00852       tapCnt = numTaps % 0x8U;
00853 
00854       while (tapCnt > 0U)
00855       {
00856          /* Read the next coefficient */
00857          c0 = *(pb++);
00858 
00859          /* Fetch 1 state variable */
00860          x7 = *(px++);
00861 
00862          /* Compute the products for this coefficient */
00863          p0 = x0 * c0;
00864          p1 = x1 * c0;
00865          p2 = x2 * c0;
00866          p3 = x3 * c0;
00867          p4 = x4 * c0;
00868          p5 = x5 * c0;
00869          p6 = x6 * c0;
00870          p7 = x7 * c0;
00871 
00872          /* Shift the sample window by one so the next tap reuses the values already loaded */
00873          x0 = x1;
00874          x1 = x2;
00875          x2 = x3;
00876          x3 = x4;
00877          x4 = x5;
00878          x5 = x6;
00879          x6 = x7;
00880 
00881          acc0 += p0;
00882          acc1 += p1;
00883          acc2 += p2;
00884          acc3 += p3;
00885          acc4 += p4;
00886          acc5 += p5;
00887          acc6 += p6;
00888          acc7 += p7;
00889 
00890          /* Decrement the loop counter */
00891          tapCnt--;
00892       }
00893 
00894       /* Advance the state pointer by 8 to process the next group of 8 samples */
00895       pState = pState + 8;
00896 
00897       /* Store the results held in the 8 accumulators in the destination buffer. */
00898       *pDst++ = acc0;
00899       *pDst++ = acc1;
00900       *pDst++ = acc2;
00901       *pDst++ = acc3;
00902       *pDst++ = acc4;
00903       *pDst++ = acc5;
00904       *pDst++ = acc6;
00905       *pDst++ = acc7;
00906 
00907       blkCnt--;
00908    }
00909 
00910    /* If the blockSize is not a multiple of 8, compute any remaining output samples here.
00911    ** No loop unrolling is used. */
00912    blkCnt = blockSize % 0x8U;
00913 
00914    while (blkCnt > 0U)
00915    {
00916       /* Copy one sample at a time into state buffer */
00917       *pStateCurnt++ = *pSrc++;
00918 
00919       /* Set the accumulator to zero */
00920       acc0 = 0.0f;
00921 
00922       /* Initialize state pointer */
00923       px = pState;
00924 
00925       /* Initialize Coefficient pointer */
00926       pb = (pCoeffs);
00927 
00928       i = numTaps;   /* NOTE(review): the do/while below always runs once, so numTaps is assumed >= 1 */
00929 
00930       /* Perform the multiply-accumulates */
00931       do
00932       {
00933          acc0 += *px++ * *pb++;
00934          i--;
00935 
00936       } while (i > 0U);
00937 
00938       /* Store the result in the destination buffer. */
00939       *pDst++ = acc0;
00940 
00941       /* Advance state pointer by 1 for the next sample */
00942       pState = pState + 1;
00943 
00944       blkCnt--;
00945    }
00946 
00947    /* Processing is complete.
00948    ** Now copy the last numTaps - 1 samples to the start of the state buffer.
00949    ** This prepares the state buffer for the next function call. */
00950 
00951    /* Points to the start of the state buffer */
00952    pStateCurnt = S->pState;
00953 
00954    tapCnt = (numTaps - 1U) >> 2U;
00955 
00956    /* Copy data, four samples per iteration */
00957    while (tapCnt > 0U)
00958    {
00959       *pStateCurnt++ = *pState++;
00960       *pStateCurnt++ = *pState++;
00961       *pStateCurnt++ = *pState++;
00962       *pStateCurnt++ = *pState++;
00963 
00964       /* Decrement the loop counter */
00965       tapCnt--;
00966    }
00967 
00968    /* Calculate remaining number of copies */
00969    tapCnt = (numTaps - 1U) % 0x4U;
00970 
00971    /* Copy the remaining float32_t samples */
00972    while (tapCnt > 0U)
00973    {
00974       *pStateCurnt++ = *pState++;
00975 
00976       /* Decrement the loop counter */
00977       tapCnt--;
00978    }
00979 }
00980 
00981 #endif
00982 
00983 /**
00984 * @} end of FIR group
00985 */
00986