CMSIS DSP library

Dependents:   KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more

Fork of mbed-dsp by mbed official

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_fir_q15.c Source File

arm_fir_q15.c

00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        17. January 2013
00005 * $Revision:    V1.4.1
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_fir_q15.c    
00009 *    
00010 * Description:  Q15 FIR filter processing function.    
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.   
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**       
00044  * @ingroup groupFilters       
00045  */
00046 
00047 /**       
00048  * @addtogroup FIR       
00049  * @{       
00050  */
00051 
00052 /**       
00053  * @brief Processing function for the Q15 FIR filter.       
00054  * @param[in] *S points to an instance of the Q15 FIR structure.       
00055  * @param[in] *pSrc points to the block of input data.       
00056  * @param[out] *pDst points to the block of output data.       
00057  * @param[in]  blockSize number of samples to process per call.       
00058  * @return none.       
00059  *   
00060  *   
00061  * \par Restrictions   
00062  *  If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
00063  *  In this case the input, output, and state buffers should be 32-bit aligned.
00064  *   
00065  * <b>Scaling and Overflow Behavior:</b>       
00066  * \par       
00067  * The function is implemented using a 64-bit internal accumulator.       
00068  * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.       
00069  * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.       
00070  * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.       
00071  * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.       
00072  * Lastly, the accumulator is saturated to yield a result in 1.15 format.       
00073  *       
00074  * \par       
00075  * Refer to the function <code>arm_fir_fast_q15()</code> for a faster but less precise implementation of this function.       
00076  */
00077 
00078 #ifndef ARM_MATH_CM0_FAMILY
00079 
00080 /* Run the below code for Cortex-M4 and Cortex-M3 */
00081 
00082 #ifndef UNALIGNED_SUPPORT_DISABLE
00083 
00084 
00085 void arm_fir_q15(                          /* Cortex-M3/M4 build, compiled when UNALIGNED_SUPPORT_DISABLE is not defined */
00086   const arm_fir_instance_q15 * S,              /* Filter instance; numTaps, pState and pCoeffs are read from it */
00087   q15_t * pSrc,                                /* Input block (blockSize samples are consumed) */
00088   q15_t * pDst,                                /* Output block (blockSize samples are written) */
00089   uint32_t blockSize)                          /* Number of samples to process in this call */
00090 {
00091   q15_t *pState = S->pState;                     /* State pointer */
00092   q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
00093   q15_t *pStateCurnt;                            /* Points to the current sample of the state */
00094   q15_t *px1;                                    /* Temporary q15 pointer for state buffer */
00095   q15_t *pb;                                     /* Temporary pointer for coefficient buffer */
00096   q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold SIMD state and coefficient values */
00097   q63_t acc0, acc1, acc2, acc3;                  /* Accumulators */
00098   uint32_t numTaps = S->numTaps;                 /* Number of taps in the filter */
00099   uint32_t tapCnt, blkCnt;                       /* Loop counters */
00100 
00101 
00102   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
00103   /* pStateCurnt points to the location where the new input data should be written */
00104   pStateCurnt = &(S->pState[(numTaps - 1u)]);
00105 
00106   /* Apply loop unrolling and compute 4 output values simultaneously.       
00107    * The variables acc0 ... acc3 hold output values that are being computed:       
00108    *       
00109    *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]       
00110    *    acc1 =  b[numTaps-1] * x[n-numTaps] +   b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]       
00111    *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] +   b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]       
00112    *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]       
00113    */
00114 
00115   blkCnt = blockSize >> 2;                       /* Number of groups of four output samples */
00116 
00117   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.       
00118    ** A second loop below computes the remaining 1 to 3 samples. */
00119   while(blkCnt > 0u)
00120   {
00121     /* Copy four new input samples into the state buffer.       
00122      ** Use 32-bit SIMD to move the 16-bit data.  Only requires two copies. */
00123     *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++;
00124     *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++;
00125 
00126     /* Set all accumulators to zero */
00127     acc0 = 0;
00128     acc1 = 0;
00129     acc2 = 0;
00130     acc3 = 0;
00131 
00132     /* Initialize state pointer of type q15 */
00133     px1 = pState;
00134 
00135     /* Initialize coefficient pointer of type q15 (coefficients are fetched in pairs via __SIMD32) */
00136     pb = pCoeffs;
00137 
00138     /* Read the first two samples from the state buffer:  x[n-N], x[n-N-1] */
00139     x0 = _SIMD32_OFFSET(px1);
00140 
00141     /* Read the second and third samples from the state buffer (one q15 element further on): x[n-N-1], x[n-N-2] */
00142     x1 = _SIMD32_OFFSET(px1 + 1u);
00143 
00144     px1 += 2u;
00145 
00146     /* Loop over the number of taps.  Unroll by a factor of 4.       
00147      ** Repeat until we've computed numTaps-(numTaps%4) coefficients. */
00148     tapCnt = numTaps >> 2;
00149 
00150     while(tapCnt > 0u)
00151     {
00152       /* Read the first two coefficients using SIMD:  b[N] and b[N-1] coefficients */
00153       c0 = *__SIMD32(pb)++;
00154 
00155       /* acc0 +=  b[N] * x[n-N] + b[N-1] * x[n-N-1] */
00156       acc0 = __SMLALD(x0, c0, acc0);
00157 
00158       /* acc1 +=  b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
00159       acc1 = __SMLALD(x1, c0, acc1);
00160 
00161       /* Read state x[n-N-2], x[n-N-3] */
00162       x2 = _SIMD32_OFFSET(px1);
00163 
00164       /* Read state x[n-N-3], x[n-N-4] */
00165       x3 = _SIMD32_OFFSET(px1 + 1u);
00166 
00167       /* acc2 +=  b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
00168       acc2 = __SMLALD(x2, c0, acc2);
00169 
00170       /* acc3 +=  b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
00171       acc3 = __SMLALD(x3, c0, acc3);
00172 
00173       /* Read coefficients b[N-2], b[N-3] */
00174       c0 = *__SIMD32(pb)++;
00175 
00176       /* acc0 +=  b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
00177       acc0 = __SMLALD(x2, c0, acc0);
00178 
00179       /* acc1 +=  b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */
00180       acc1 = __SMLALD(x3, c0, acc1);
00181 
00182       /* Read state x[n-N-4], x[n-N-5]; x0 is reused by the next iteration or by the 2-tap remainder below */
00183       x0 = _SIMD32_OFFSET(px1 + 2u);
00184 
00185       /* Read state x[n-N-5], x[n-N-6]; x1 is reused by the next iteration or by the 2-tap remainder below */
00186       x1 = _SIMD32_OFFSET(px1 + 3u);
00187 
00188       /* acc2 +=  b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
00189       acc2 = __SMLALD(x0, c0, acc2);
00190 
00191       /* acc3 +=  b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
00192       acc3 = __SMLALD(x1, c0, acc3);
00193 
00194       px1 += 4u;
00195 
00196       tapCnt--;
00197 
00198     }
00199 
00200 
00201     /* If the filter length is not a multiple of 4, compute the remaining filter taps.       
00202      ** This is always 2 taps since the filter length is even. */
00203     if((numTaps & 0x3u) != 0u)
00204     {
00205       /* Read 2 coefficients */
00206       c0 = *__SIMD32(pb)++;
00207 
00208       /* Fetch the two further state words; x0 and x1 still hold the words loaded before this point */
00209       x2 = _SIMD32_OFFSET(px1);
00210 
00211       x3 = _SIMD32_OFFSET(px1 + 1u);
00212 
00213       /* Perform the multiply-accumulates */
00214       acc0 = __SMLALD(x0, c0, acc0);
00215 
00216       px1 += 2u;
00217 
00218       acc1 = __SMLALD(x1, c0, acc1);
00219       acc2 = __SMLALD(x2, c0, acc2);
00220       acc3 = __SMLALD(x3, c0, acc3);
00221     }
00222 
00223     /* The results are held in the four 64-bit accumulators.  Shift right by 15 and saturate to 16 bits (1.15).       
00224      ** Then store the 4 outputs in the destination buffer, packed two per 32-bit write. */
00225 
00226 #ifndef ARM_MATH_BIG_ENDIAN
00227 
00228     *__SIMD32(pDst)++ =
00229       __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
00230     *__SIMD32(pDst)++ =
00231       __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
00232 
00233 #else
00234 
00235     *__SIMD32(pDst)++ =
00236       __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
00237     *__SIMD32(pDst)++ =
00238       __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
00239 
00240 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN       */
00241 
00242 
00243 
00244     /* Advance the state pointer by 4 to process the next group of 4 samples */
00245     pState = pState + 4;
00246 
00247     /* Decrement the loop counter */
00248     blkCnt--;
00249   }
00250 
00251   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.       
00252    ** No loop unrolling is used. */
00253   blkCnt = blockSize % 0x4u;
00254   while(blkCnt > 0u)
00255   {
00256     /* Copy one sample into state buffer */
00257     *pStateCurnt++ = *pSrc++;
00258 
00259     /* Set the accumulator to zero */
00260     acc0 = 0;
00261 
00262     /* Initialize state pointer of type q15 */
00263     px1 = pState;
00264 
00265     /* Initialize coefficient pointer of type q15 (coefficients are fetched in pairs via __SIMD32) */
00266     pb = pCoeffs;
00267 
00268     tapCnt = numTaps >> 1;                       /* One iteration per pair of taps */
00269 
00270     do                                           /* do-while: the body runs at least once, even if tapCnt is 0 */
00271     {
00272 
00273       c0 = *__SIMD32(pb)++;
00274       x0 = *__SIMD32(px1)++;
00275 
00276       acc0 = __SMLALD(x0, c0, acc0);
00277       tapCnt--;
00278     }
00279     while(tapCnt > 0u);
00280 
00281     /* The result is held in the 64-bit accumulator.  Shift right by 15 and saturate to 16 bits (1.15).       
00282      ** Then store the output in the destination buffer. */
00283     *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
00284 
00285     /* Advance state pointer by 1 for the next sample */
00286     pState = pState + 1;
00287 
00288     /* Decrement the loop counter */
00289     blkCnt--;
00290   }
00291 
00292   /* Processing is complete.       
00293    ** Now copy the last numTaps - 1 samples to the start of the state buffer.       
00294    ** This prepares the state buffer for the next function call. */
00295 
00296   /* Points to the start of the state buffer */
00297   pStateCurnt = S->pState;
00298 
00299   /* Number of groups of four samples to copy (two 32-bit writes per group) */
00300   tapCnt = (numTaps - 1u) >> 2;
00301 
00302   while(tapCnt > 0u)
00303   {
00304 
00305     /* Copy state values to start of state buffer */
00306     *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
00307     *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
00308 
00309     tapCnt--;
00310 
00311   }
00312 
00313   /* Calculation of count for remaining q15_t data */
00314   tapCnt = (numTaps - 1u) % 0x4u;
00315 
00316   /* copy remaining data */
00317   while(tapCnt > 0u)
00318   {
00319     *pStateCurnt++ = *pState++;
00320 
00321     /* Decrement the loop counter */
00322     tapCnt--;
00323   }
00324 }
00325 
00326 #else /* UNALIGNED_SUPPORT_DISABLE */
00327 
00328 void arm_fir_q15(                          /* Cortex-M3/M4 build, compiled when UNALIGNED_SUPPORT_DISABLE is defined */
00329   const arm_fir_instance_q15 * S,              /* Filter instance; numTaps, pState and pCoeffs are read from it */
00330   q15_t * pSrc,                                /* Input block (blockSize samples are consumed) */
00331   q15_t * pDst,                                /* Output block (blockSize samples are written) */
00332   uint32_t blockSize)                          /* Number of samples to process in this call */
00333 {
00334   q15_t *pState = S->pState;                     /* State pointer */
00335   q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
00336   q15_t *pStateCurnt;                            /* Points to the current sample of the state */
00337   q63_t acc0, acc1, acc2, acc3;                  /* Accumulators */
00338   q15_t *pb;                                     /* Temporary pointer for coefficient buffer */
00339   q15_t *px;                                     /* Temporary q15 pointer for state buffer (read through the SIMD macros) */
00340   q31_t x0, x1, x2, c0;                          /* Temporary variables to hold SIMD state and coefficient values */
00341   uint32_t numTaps = S->numTaps;                 /* Number of taps in the filter */
00342   uint32_t tapCnt, blkCnt;                       /* Loop counters */
00343 
00344 
00345   /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
00346   /* pStateCurnt points to the location where the new input data should be written */
00347   pStateCurnt = &(S->pState[(numTaps - 1u)]);
00348 
00349   /* Apply loop unrolling and compute 4 output values simultaneously.      
00350    * The variables acc0 ... acc3 hold output values that are being computed:      
00351    *      
00352    *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]      
00353    *    acc1 =  b[numTaps-1] * x[n-numTaps] +   b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]      
00354    *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] +   b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]      
00355    *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]      
00356    */
00357 
00358   blkCnt = blockSize >> 2;                       /* Number of groups of four output samples */
00359 
00360   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.      
00361    ** A second loop below computes the remaining 1 to 3 samples. */
00362   while(blkCnt > 0u)
00363   {
00364     /* Copy four new input samples into the state buffer.      
00365      ** The samples are moved one 16-bit value at a time (no 32-bit SIMD copy in this variant). */
00366     *pStateCurnt++ = *pSrc++;
00367     *pStateCurnt++ = *pSrc++;
00368     *pStateCurnt++ = *pSrc++;
00369     *pStateCurnt++ = *pSrc++;
00370 
00371 
00372     /* Set all accumulators to zero */
00373     acc0 = 0;
00374     acc1 = 0;
00375     acc2 = 0;
00376     acc3 = 0;
00377 
00378     /* Initialize the state pointer; states are read in pairs through the SIMD macros below */
00379     px = pState;
00380 
00381     /* Initialize the coefficient pointer; coefficients are read in pairs through __SIMD32 below */
00382     pb = pCoeffs;
00383 
00384     /* Read the first two samples from the state buffer:  x[n-N], x[n-N-1] */
00385     x0 = *__SIMD32(px)++;
00386 
00387     /* Read the third and fourth samples from the state buffer: x[n-N-2], x[n-N-3] */
00388     x2 = *__SIMD32(px)++;
00389 
00390     /* Loop over the number of taps.  Unroll by a factor of 4.      
00391      ** Repeat until we've computed numTaps-(numTaps%4) coefficients. */
00392     tapCnt = numTaps >> 2;
00393 
00394     while(tapCnt > 0)
00395     {
00396       /* Read the first two coefficients using SIMD:  b[N] and b[N-1] coefficients */
00397       c0 = *__SIMD32(pb)++;
00398 
00399       /* acc0 +=  b[N] * x[n-N] + b[N-1] * x[n-N-1] */
00400       acc0 = __SMLALD(x0, c0, acc0);
00401 
00402       /* acc2 +=  b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
00403       acc2 = __SMLALD(x2, c0, acc2);
00404 
00405       /* pack  x[n-N-1] and x[n-N-2] (built from x0 and x2 instead of an odd-offset load) */
00406 #ifndef ARM_MATH_BIG_ENDIAN
00407       x1 = __PKHBT(x2, x0, 0);
00408 #else
00409       x1 = __PKHBT(x0, x2, 0);
00410 #endif
00411 
00412       /* Read state x[n-N-4], x[n-N-5] */
00413       x0 = _SIMD32_OFFSET(px);
00414 
00415       /* acc1 +=  b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
00416       acc1 = __SMLALDX(x1, c0, acc1);
00417 
00418       /* pack  x[n-N-3] and x[n-N-4] */
00419 #ifndef ARM_MATH_BIG_ENDIAN
00420       x1 = __PKHBT(x0, x2, 0);
00421 #else
00422       x1 = __PKHBT(x2, x0, 0);
00423 #endif
00424 
00425       /* acc3 +=  b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
00426       acc3 = __SMLALDX(x1, c0, acc3);
00427 
00428       /* Read coefficients b[N-2], b[N-3] */
00429       c0 = *__SIMD32(pb)++;
00430 
00431       /* acc0 +=  b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
00432       acc0 = __SMLALD(x2, c0, acc0);
00433 
00434       /* Read state x[n-N-6], x[n-N-7] with offset */
00435       x2 = _SIMD32_OFFSET(px + 2u);
00436 
00437       /* acc2 +=  b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
00438       acc2 = __SMLALD(x0, c0, acc2);
00439 
00440       /* acc1 +=  b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */
00441       acc1 = __SMLALDX(x1, c0, acc1);
00442 
00443       /* pack  x[n-N-5] and x[n-N-6] */
00444 #ifndef ARM_MATH_BIG_ENDIAN
00445       x1 = __PKHBT(x2, x0, 0);
00446 #else
00447       x1 = __PKHBT(x0, x2, 0);
00448 #endif
00449 
00450       /* acc3 +=  b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
00451       acc3 = __SMLALDX(x1, c0, acc3);
00452 
00453       /* Update state pointer for next state reading */
00454       px += 4u;
00455 
00456       /* Decrement tap count */
00457       tapCnt--;
00458 
00459     }
00460 
00461     /* If the filter length is not a multiple of 4, compute the remaining filter taps.       
00462      ** This is always 2 taps since the filter length is even. */
00463     if((numTaps & 0x3u) != 0u)
00464     {
00465 
00466       /* Read last two coefficients */
00467       c0 = *__SIMD32(pb)++;
00468 
00469       /* Perform the multiply-accumulates (x0 and x2 still hold the words loaded before this point) */
00470       acc0 = __SMLALD(x0, c0, acc0);
00471       acc2 = __SMLALD(x2, c0, acc2);
00472 
00473       /* pack state variables */
00474 #ifndef ARM_MATH_BIG_ENDIAN
00475       x1 = __PKHBT(x2, x0, 0);
00476 #else
00477       x1 = __PKHBT(x0, x2, 0);
00478 #endif
00479 
00480       /* Read last state variables (px is not advanced here) */
00481       x0 = *__SIMD32(px);
00482 
00483       /* Perform the multiply-accumulates */
00484       acc1 = __SMLALDX(x1, c0, acc1);
00485 
00486       /* pack state variables */
00487 #ifndef ARM_MATH_BIG_ENDIAN
00488       x1 = __PKHBT(x0, x2, 0);
00489 #else
00490       x1 = __PKHBT(x2, x0, 0);
00491 #endif
00492 
00493       /* Perform the multiply-accumulates */
00494       acc3 = __SMLALDX(x1, c0, acc3);
00495     }
00496 
00497     /* The results are held in the four 64-bit accumulators.  Shift right by 15 and saturate to 16 bits (1.15).       
00498      ** Then store the 4 outputs in the destination buffer, packed two per 32-bit write. */
00499 
00500 #ifndef ARM_MATH_BIG_ENDIAN
00501 
00502     *__SIMD32(pDst)++ =
00503       __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
00504 
00505     *__SIMD32(pDst)++ =
00506       __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
00507 
00508 #else
00509 
00510     *__SIMD32(pDst)++ =
00511       __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
00512 
00513     *__SIMD32(pDst)++ =
00514       __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
00515 
00516 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN       */
00517 
00518     /* Advance the state pointer by 4 to process the next group of 4 samples */
00519     pState = pState + 4;
00520 
00521     /* Decrement the loop counter */
00522     blkCnt--;
00523   }
00524 
00525   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.      
00526    ** No loop unrolling is used. */
00527   blkCnt = blockSize % 0x4u;
00528   while(blkCnt > 0u)
00529   {
00530     /* Copy one sample into state buffer */
00531     *pStateCurnt++ = *pSrc++;
00532 
00533     /* Set the accumulator to zero */
00534     acc0 = 0;
00535 
00536     /* Initialize state and coefficient pointers (this loop uses scalar q15 accesses) */
00537     px = pState;
00538     pb = pCoeffs;
00539 
00540     tapCnt = numTaps >> 1u;                      /* One iteration per pair of taps */
00541 
00542     do                                           /* do-while: the body runs at least once, even if tapCnt is 0 */
00543     {
00544       acc0 += (q31_t) * px++ * *pb++;
00545       acc0 += (q31_t) * px++ * *pb++;
00546       tapCnt--;
00547     }
00548     while(tapCnt > 0u);
00549 
00550     /* The result is held in the 64-bit accumulator.  Shift right by 15 and saturate to 16 bits (1.15).      
00551      ** Then store the output in the destination buffer. */
00552     *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
00553 
00554     /* Advance state pointer by 1 for the next sample */
00555     pState = pState + 1u;
00556 
00557     /* Decrement the loop counter */
00558     blkCnt--;
00559   }
00560 
00561   /* Processing is complete.      
00562    ** Now copy the last numTaps - 1 samples to the start of the state buffer.      
00563    ** This prepares the state buffer for the next function call. */
00564 
00565   /* Points to the start of the state buffer */
00566   pStateCurnt = S->pState;
00567 
00568   /* Number of groups of four samples to copy */
00569   tapCnt = (numTaps - 1u) >> 2;
00570 
00571   while(tapCnt > 0u)
00572   {
00573     *pStateCurnt++ = *pState++;
00574     *pStateCurnt++ = *pState++;
00575     *pStateCurnt++ = *pState++;
00576     *pStateCurnt++ = *pState++;
00577 
00578     tapCnt--;
00579 
00580   }
00581 
00582   /* Calculation of count for remaining q15_t data */
00583   tapCnt = (numTaps - 1u) % 0x4u;
00584 
00585   /* copy remaining data */
00586   while(tapCnt > 0u)
00587   {
00588     *pStateCurnt++ = *pState++;
00589 
00590     /* Decrement the loop counter */
00591     tapCnt--;
00592   }
00593 }
00594 
00595 
00596 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
00597 
00598 #else /* ARM_MATH_CM0_FAMILY */
00599 
00600 
00601 /* Run the below code for Cortex-M0 */
00602 
00603 void arm_fir_q15(                          /* Cortex-M0 build (ARM_MATH_CM0_FAMILY defined): plain scalar loop, one output per pass */
00604   const arm_fir_instance_q15 * S,              /* Filter instance; numTaps, pState and pCoeffs are read from it */
00605   q15_t * pSrc,                                /* Input block (blockSize samples are consumed) */
00606   q15_t * pDst,                                /* Output block (blockSize samples are written) */
00607   uint32_t blockSize)                          /* Number of samples to process in this call */
00608 {
00609   q15_t *pState = S->pState;                     /* State pointer */
00610   q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
00611   q15_t *pStateCurnt;                            /* Points to the current sample of the state */
00612 
00613 
00614 
00615   q15_t *px;                                     /* Temporary pointer for state buffer */
00616   q15_t *pb;                                     /* Temporary pointer for coefficient buffer */
00617   q63_t acc;                                     /* Accumulator */
00618   uint32_t numTaps = S->numTaps;                 /* Number of taps in the filter */
00619   uint32_t tapCnt, blkCnt;                       /* Loop counters */
00620 
00621   /* S->pState buffer contains previous frame (numTaps - 1) samples */
00622   /* pStateCurnt points to the location where the new input data should be written */
00623   pStateCurnt = &(S->pState[(numTaps - 1u)]);
00624 
00625   /* Initialize blkCnt with blockSize */
00626   blkCnt = blockSize;
00627 
00628   while(blkCnt > 0u)
00629   {
00630     /* Copy one sample at a time into state buffer */
00631     *pStateCurnt++ = *pSrc++;
00632 
00633     /* Set the accumulator to zero */
00634     acc = 0;
00635 
00636     /* Initialize state pointer */
00637     px = pState;
00638 
00639     /* Initialize Coefficient pointer */
00640     pb = pCoeffs;
00641 
00642     tapCnt = numTaps;
00643 
00644     /* Perform the multiply-accumulates (do-while: the body runs at least once, even if tapCnt is 0) */
00645     do
00646     {
00647       /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
00648       acc += (q31_t) * px++ * *pb++;
00649       tapCnt--;
00650     } while(tapCnt > 0u);
00651 
00652     /* The result is held in the 64-bit accumulator.  Shift right by 15 and saturate to 16 bits (1.15).         
00653      ** Then store the output in the destination buffer. */
00654     *pDst++ = (q15_t) __SSAT((acc >> 15u), 16);
00655 
00656     /* Advance state pointer by 1 for the next sample */
00657     pState = pState + 1;
00658 
00659     /* Decrement the samples loop counter */
00660     blkCnt--;
00661   }
00662 
00663   /* Processing is complete.         
00664    ** Now copy the last numTaps - 1 samples to the start of the state buffer.       
00665    ** This prepares the state buffer for the next function call. */
00666 
00667   /* Points to the start of the state buffer */
00668   pStateCurnt = S->pState;
00669 
00670   /* Copy (numTaps - 1) values */
00671   tapCnt = (numTaps - 1u);
00672 
00673   /* copy data */
00674   while(tapCnt > 0u)
00675   {
00676     *pStateCurnt++ = *pState++;
00677 
00678     /* Decrement the loop counter */
00679     tapCnt--;
00680   }
00681 
00682 }
00683 
00684 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
00685 
00686 
00687 
00688 
00689 /**       
00690  * @} end of FIR group       
00691  */