Added CMSIS5 DSP and NN folder. Needs some work

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_fir_fast_q15.c Source File

arm_fir_fast_q15.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_fir_fast_q15.c
00004  * Description:  Q15 Fast FIR filter processing function
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupFilters
00033  */
00034 
00035 /**
00036  * @addtogroup FIR
00037  * @{
00038  */
00039 
00040 /** @brief Processing function for the fast Q15 FIR filter.
00041  * @param[in] *S points to an instance of the Q15 FIR filter structure.
00042  * @param[in] *pSrc points to the block of input data.
00043  * @param[out] *pDst points to the block of output data.
00044  * @param[in] blockSize number of samples to process per call.
00045  * @return none.
00046  *
00047  * <b>Scaling and Overflow Behavior:</b>
00048  * \par
00049  * This fast version uses a 32-bit accumulator with 2.30 format.
00050  * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
00051  * Thus, if the accumulator result overflows it wraps around and distorts the result.
00052  * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
00053  * The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result.
00054  *
00055  * \par
00056  * Refer to the function <code>arm_fir_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.  Both the slow and the fast versions use the same instance structure.
00057  * Use the function <code>arm_fir_init_q15()</code> to initialize the filter structure.
00058  */
00059 
00060 void arm_fir_fast_q15(
00061   const arm_fir_instance_q15 * S,
00062   q15_t * pSrc,
00063   q15_t * pDst,
00064   uint32_t blockSize)
00065 {
00066   q15_t *pState = S->pState;                     /* Read pointer into the state buffer; advanced as outputs are produced */
00067   q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
00068   q15_t *pStateCurnt;                            /* Write pointer: where the next input sample is stored in the state buffer */
00069   q31_t acc0, acc1, acc2, acc3;                  /* Accumulators, one per output sample computed in parallel */
00070   q15_t *pb;                                     /* Temporary pointer for coefficient buffer */
00071   q15_t *px;                                     /* Temporary q15 pointer for state buffer; accessed through __SIMD32 in the unrolled loop */
00072   q31_t x0, x1, x2, c0;                          /* Temporary variables to hold SIMD state and coefficient values */
00073   uint32_t numTaps = S->numTaps;                 /* Number of taps in the filter */
00074   uint32_t tapCnt, blkCnt;                       /* Loop counters */
00075 
00076 
00077   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
00078   /* pStateCurnt points to the location where the new input data should be written */
00079   pStateCurnt = &(S->pState[(numTaps - 1U)]);
00080 
00081   /* Apply loop unrolling and compute 4 output values simultaneously.
00082    * The variables acc0 ... acc3 hold output values that are being computed:
00083    *
00084    *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
00085    *    acc1 =  b[numTaps-1] * x[n-numTaps] +   b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
00086    *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] +   b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
00087    *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
00088    */
00089 
00090   blkCnt = blockSize >> 2;
00091 
00092   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
00093    ** A second loop below computes the remaining 1 to 3 samples. */
00094   while (blkCnt > 0U)
00095   {
00096     /* Copy four new input samples into the state buffer.
00097      ** The copy is done one q15 sample at a time (four scalar stores). */
00098     *pStateCurnt++ = *pSrc++;
00099     *pStateCurnt++ = *pSrc++;
00100     *pStateCurnt++ = *pSrc++;
00101     *pStateCurnt++ = *pSrc++;
00102 
00103 
00104     /* Set all accumulators to zero */
00105     acc0 = 0;
00106     acc1 = 0;
00107     acc2 = 0;
00108     acc3 = 0;
00109 
00110     /* Start reading the state at pState; px is read through __SIMD32 below */
00111     px = pState;
00112 
00113     /* Start reading the coefficients at pCoeffs; pb is read through __SIMD32 below */
00114     pb = pCoeffs;
00115 
00116     /* Read the first two samples from the state buffer:  x[n-N], x[n-N-1] */
00117     x0 = *__SIMD32(px)++;
00118 
00119     /* Read the third and fourth samples from the state buffer: x[n-N-2], x[n-N-3] */
00120     x2 = *__SIMD32(px)++;
00121 
00122     /* Loop over the number of taps.  Unroll by a factor of 4.
00123      ** Repeat until we've computed numTaps-(numTaps%4) coefficients. */
00124     tapCnt = numTaps >> 2;
00125 
00126     while (tapCnt > 0)
00127     {
00128       /* Read the first two coefficients using SIMD:  b[N] and b[N-1] coefficients */
00129       c0 = *__SIMD32(pb)++;
00130 
00131       /* acc0 +=  b[N] * x[n-N] + b[N-1] * x[n-N-1] */
00132       acc0 = __SMLAD(x0, c0, acc0);
00133 
00134       /* acc2 +=  b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
00135       acc2 = __SMLAD(x2, c0, acc2);
00136 
00137       /* pack  x[n-N-1] and x[n-N-2] (operand order depends on endianness) */
00138 #ifndef ARM_MATH_BIG_ENDIAN
00139       x1 = __PKHBT(x2, x0, 0);
00140 #else
00141       x1 = __PKHBT(x0, x2, 0);
00142 #endif
00143 
00144       /* Read state x[n-N-4], x[n-N-5] */
00145       x0 = _SIMD32_OFFSET(px);
00146 
00147       /* acc1 +=  b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
00148       acc1 = __SMLADX(x1, c0, acc1);
00149 
00150       /* pack  x[n-N-3] and x[n-N-4] */
00151 #ifndef ARM_MATH_BIG_ENDIAN
00152       x1 = __PKHBT(x0, x2, 0);
00153 #else
00154       x1 = __PKHBT(x2, x0, 0);
00155 #endif
00156 
00157       /* acc3 +=  b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
00158       acc3 = __SMLADX(x1, c0, acc3);
00159 
00160       /* Read coefficients b[N-2], b[N-3] */
00161       c0 = *__SIMD32(pb)++;
00162 
00163       /* acc0 +=  b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
00164       acc0 = __SMLAD(x2, c0, acc0);
00165 
00166       /* Read state x[n-N-6], x[n-N-7] with offset */
00167       x2 = _SIMD32_OFFSET(px + 2U);
00168 
00169       /* acc2 +=  b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
00170       acc2 = __SMLAD(x0, c0, acc2);
00171 
00172       /* acc1 +=  b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */
00173       acc1 = __SMLADX(x1, c0, acc1);
00174 
00175       /* pack  x[n-N-5] and x[n-N-6] */
00176 #ifndef ARM_MATH_BIG_ENDIAN
00177       x1 = __PKHBT(x2, x0, 0);
00178 #else
00179       x1 = __PKHBT(x0, x2, 0);
00180 #endif
00181 
00182       /* acc3 +=  b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
00183       acc3 = __SMLADX(x1, c0, acc3);
00184 
00185       /* Advance the state pointer by the four samples consumed in this iteration */
00186       px += 4U;
00187 
00188       /* Decrement tap count */
00189       tapCnt--;
00190 
00191     }
00192 
00193     /* If the filter length is not a multiple of 4, compute the remaining filter taps.
00194      ** This is always 2 taps, since the filter length is expected to be even. */
00195     if ((numTaps & 0x3U) != 0U)
00196     {
00197 
00198       /* Read last two coefficients */
00199       c0 = *__SIMD32(pb)++;
00200 
00201       /* Perform the multiply-accumulates */
00202       acc0 = __SMLAD(x0, c0, acc0);
00203       acc2 = __SMLAD(x2, c0, acc2);
00204 
00205       /* pack state variables */
00206 #ifndef ARM_MATH_BIG_ENDIAN
00207       x1 = __PKHBT(x2, x0, 0);
00208 #else
00209       x1 = __PKHBT(x0, x2, 0);
00210 #endif
00211 
00212       /* Read last state variables */
00213       x0 = *__SIMD32(px);
00214 
00215       /* Perform the multiply-accumulates */
00216       acc1 = __SMLADX(x1, c0, acc1);
00217 
00218       /* pack state variables */
00219 #ifndef ARM_MATH_BIG_ENDIAN
00220       x1 = __PKHBT(x0, x2, 0);
00221 #else
00222       x1 = __PKHBT(x2, x0, 0);
00223 #endif
00224 
00225       /* Perform the multiply-accumulates */
00226       acc3 = __SMLADX(x1, c0, acc3);
00227     }
00228 
00229     /* The results in the 4 accumulators are in 2.30 format.  Convert to 1.15 with saturation.
00230      ** Then store the 4 outputs in the destination buffer, two packed q15 values per 32-bit store. */
00231 
00232 #ifndef ARM_MATH_BIG_ENDIAN
00233 
00234     *__SIMD32(pDst)++ =
00235       __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
00236 
00237     *__SIMD32(pDst)++ =
00238       __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
00239 
00240 #else
00241 
00242     *__SIMD32(pDst)++ =
00243       __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
00244 
00245     *__SIMD32(pDst)++ =
00246       __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
00247 
00248 
00249 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN       */
00250 
00251     /* Advance the state pointer by 4 to process the next group of 4 samples */
00252     pState = pState + 4U;
00253 
00254     /* Decrement the loop counter */
00255     blkCnt--;
00256   }
00257 
00258   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
00259    ** No loop unrolling is used. */
00260   blkCnt = blockSize % 0x4U;
00261   while (blkCnt > 0U)
00262   {
00263     /* Copy one new input sample into the state buffer */
00264     *pStateCurnt++ = *pSrc++;
00265 
00266     /* Set the accumulator to zero */
00267     acc0 = 0;
00268 
00269     /* Scalar pointers into the state and coefficient buffers (no SIMD in this loop) */
00270     px = pState;
00271     pb = pCoeffs;
00272 
00273     tapCnt = numTaps >> 1U;
00274 
00275     do
00276     {
00277       /* Two taps per iteration. NOTE(review): the body runs at least once, so numTaps < 2 would wrap tapCnt. */
00278       acc0 += (q31_t) * px++ * *pb++;
00279       acc0 += (q31_t) * px++ * *pb++;
00280 
00281       tapCnt--;
00282     }
00283     while (tapCnt > 0U);
00284 
00285     /* The result is in 2.30 format.  Convert to 1.15 with saturation.
00286      ** Then store the output in the destination buffer. */
00287     *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
00288 
00289     /* Advance state pointer by 1 for the next sample */
00290     pState = pState + 1U;
00291 
00292     /* Decrement the loop counter */
00293     blkCnt--;
00294   }
00295 
00296   /* Processing is complete.
00297    ** Now copy the last numTaps - 1 samples to the start of the state buffer.
00298    ** This prepares the state buffer for the next function call. */
00299 
00300   /* Points to the start of the state buffer */
00301   pStateCurnt = S->pState;
00302 
00303   /* Number of groups of four q15 samples to copy */
00304   tapCnt = (numTaps - 1U) >> 2;
00305 
00306   while (tapCnt > 0U)
00307   {
00308     *pStateCurnt++ = *pState++;
00309     *pStateCurnt++ = *pState++;
00310     *pStateCurnt++ = *pState++;
00311     *pStateCurnt++ = *pState++;
00312 
00313     tapCnt--;
00314 
00315   }
00316 
00317   /* Number of remaining q15 samples (0 to 3) left after the groups of four */
00318   tapCnt = (numTaps - 1U) % 0x4U;
00319 
00320   /* copy remaining data */
00321   while (tapCnt > 0U)
00322   {
00323     *pStateCurnt++ = *pState++;
00324 
00325     /* Decrement the loop counter */
00326     tapCnt--;
00327   }
00328 
00329 }
00330 
00331 /**
00332  * @} end of FIR group
00333  */
00334