Robert Lopez / CMSIS5
Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_fir_decimate_fast_q15.c Source File

arm_fir_decimate_fast_q15.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_fir_decimate_fast_q15.c
00004  * Description:  Fast Q15 FIR Decimator
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupFilters
00033  */
00034 
00035 /**
00036  * @addtogroup FIR_decimate
00037  * @{
00038  */
00039 
00040 /**
00041  * @brief Processing function for the Q15 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4.
00042  * @param[in] *S points to an instance of the Q15 FIR decimator structure.
00043  * @param[in] *pSrc points to the block of input data.
00044  * @param[out] *pDst points to the block of output data
00045  * @param[in] blockSize number of input samples to process per call.
00046  * @return none
00047  *
00048  * \par Restrictions
00049  *  If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
00050  *  In this case the input, output, and state buffers should be 32-bit aligned.
00051  *
00052  * <b>Scaling and Overflow Behavior:</b>
00053  * \par
00054  * This fast version uses a 32-bit accumulator with 2.30 format.
00055  * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
00056  * Thus, if the accumulator result overflows it wraps around and distorts the result.
00057  * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (log2 is read as log to the base 2).
00058  * The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result.
00059  *
00060  * \par
00061  * Refer to the function <code>arm_fir_decimate_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
00062  * Both the slow and the fast versions use the same instance structure.
00063  * Use the function <code>arm_fir_decimate_init_q15()</code> to initialize the filter structure.
00064  */
00065 
00066 #ifndef UNALIGNED_SUPPORT_DISABLE
00067 
00068 void arm_fir_decimate_fast_q15(
00069   const arm_fir_decimate_instance_q15 * S,
00070   q15_t * pSrc,
00071   q15_t * pDst,
00072   uint32_t blockSize)
00073 {
00074   q15_t *pState = S->pState;                     /* State pointer */
00075   q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
00076   q15_t *pStateCurnt;                            /* Points to the current sample of the state */
00077   q15_t *px;                                     /* Temporary pointer for state buffer */
00078   q15_t *pb;                                     /* Temporary pointer coefficient buffer */
00079   q31_t x0, x1, c0, c1;                          /* Temporary variables to hold state and coefficient values */
00080   q31_t sum0;                                    /* Accumulators */
00081   q31_t acc0, acc1;
00082   q15_t *px0, *px1;
00083   uint32_t blkCntN3;
00084   uint32_t numTaps = S->numTaps;                 /* Number of taps */
00085   uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M;  /* Loop counters */
00086 
00087 
00088   /* S->pState buffer contains previous frame (numTaps - 1) samples */
00089   /* pStateCurnt points to the location where the new input data should be written */
00090   pStateCurnt = S->pState + (numTaps - 1U);
00091 
00092 
00093   /* Total number of output samples to be computed */
00094   blkCnt = outBlockSize / 2;
00095   blkCntN3 = outBlockSize - (2 * blkCnt);
00096 
00097 
00098   while (blkCnt > 0U)
00099   {
00100     /* Copy decimation factor number of new input samples into the state buffer */
00101     i = 2 * S->M;
00102 
00103     do
00104     {
00105       *pStateCurnt++ = *pSrc++;
00106 
00107     } while (--i);
00108 
00109     /* Set accumulator to zero */
00110     acc0 = 0;
00111     acc1 = 0;
00112 
00113     /* Initialize state pointer */
00114     px0 = pState;
00115 
00116     px1 = pState + S->M;
00117 
00118 
00119     /* Initialize coeff pointer */
00120     pb = pCoeffs;
00121 
00122     /* Loop unrolling.  Process 4 taps at a time. */
00123     tapCnt = numTaps >> 2;
00124 
00125     /* Loop over the number of taps.  Unroll by a factor of 4.
00126      ** Repeat until we've computed numTaps-4 coefficients. */
00127     while (tapCnt > 0U)
00128     {
00129       /* Read the Read b[numTaps-1] and b[numTaps-2]  coefficients */
00130       c0 = *__SIMD32(pb)++;
00131 
00132       /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */
00133       x0 = *__SIMD32(px0)++;
00134 
00135       x1 = *__SIMD32(px1)++;
00136 
00137       /* Perform the multiply-accumulate */
00138       acc0 = __SMLAD(x0, c0, acc0);
00139 
00140       acc1 = __SMLAD(x1, c0, acc1);
00141 
00142       /* Read the b[numTaps-3] and b[numTaps-4] coefficient */
00143       c0 = *__SIMD32(pb)++;
00144 
00145       /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
00146       x0 = *__SIMD32(px0)++;
00147 
00148       x1 = *__SIMD32(px1)++;
00149 
00150       /* Perform the multiply-accumulate */
00151       acc0 = __SMLAD(x0, c0, acc0);
00152 
00153       acc1 = __SMLAD(x1, c0, acc1);
00154 
00155       /* Decrement the loop counter */
00156       tapCnt--;
00157     }
00158 
00159     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
00160     tapCnt = numTaps % 0x4U;
00161 
00162     while (tapCnt > 0U)
00163     {
00164       /* Read coefficients */
00165       c0 = *pb++;
00166 
00167       /* Fetch 1 state variable */
00168       x0 = *px0++;
00169 
00170       x1 = *px1++;
00171 
00172       /* Perform the multiply-accumulate */
00173       acc0 = __SMLAD(x0, c0, acc0);
00174       acc1 = __SMLAD(x1, c0, acc1);
00175 
00176       /* Decrement the loop counter */
00177       tapCnt--;
00178     }
00179 
00180     /* Advance the state pointer by the decimation factor
00181      * to process the next group of decimation factor number samples */
00182     pState = pState + S->M * 2;
00183 
00184     /* Store filter output, smlad returns the values in 2.14 format */
00185     /* so downsacle by 15 to get output in 1.15 */
00186     *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
00187     *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
00188 
00189     /* Decrement the loop counter */
00190     blkCnt--;
00191   }
00192 
00193 
00194 
00195   while (blkCntN3 > 0U)
00196   {
00197     /* Copy decimation factor number of new input samples into the state buffer */
00198     i = S->M;
00199 
00200     do
00201     {
00202       *pStateCurnt++ = *pSrc++;
00203 
00204     } while (--i);
00205 
00206     /*Set sum to zero */
00207     sum0 = 0;
00208 
00209     /* Initialize state pointer */
00210     px = pState;
00211 
00212     /* Initialize coeff pointer */
00213     pb = pCoeffs;
00214 
00215     /* Loop unrolling.  Process 4 taps at a time. */
00216     tapCnt = numTaps >> 2;
00217 
00218     /* Loop over the number of taps.  Unroll by a factor of 4.
00219      ** Repeat until we've computed numTaps-4 coefficients. */
00220     while (tapCnt > 0U)
00221     {
00222       /* Read the Read b[numTaps-1] and b[numTaps-2]  coefficients */
00223       c0 = *__SIMD32(pb)++;
00224 
00225       /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */
00226       x0 = *__SIMD32(px)++;
00227 
00228       /* Read the b[numTaps-3] and b[numTaps-4] coefficient */
00229       c1 = *__SIMD32(pb)++;
00230 
00231       /* Perform the multiply-accumulate */
00232       sum0 = __SMLAD(x0, c0, sum0);
00233 
00234       /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
00235       x0 = *__SIMD32(px)++;
00236 
00237       /* Perform the multiply-accumulate */
00238       sum0 = __SMLAD(x0, c1, sum0);
00239 
00240       /* Decrement the loop counter */
00241       tapCnt--;
00242     }
00243 
00244     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
00245     tapCnt = numTaps % 0x4U;
00246 
00247     while (tapCnt > 0U)
00248     {
00249       /* Read coefficients */
00250       c0 = *pb++;
00251 
00252       /* Fetch 1 state variable */
00253       x0 = *px++;
00254 
00255       /* Perform the multiply-accumulate */
00256       sum0 = __SMLAD(x0, c0, sum0);
00257 
00258       /* Decrement the loop counter */
00259       tapCnt--;
00260     }
00261 
00262     /* Advance the state pointer by the decimation factor
00263      * to process the next group of decimation factor number samples */
00264     pState = pState + S->M;
00265 
00266     /* Store filter output, smlad returns the values in 2.14 format */
00267     /* so downsacle by 15 to get output in 1.15 */
00268     *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
00269 
00270     /* Decrement the loop counter */
00271     blkCntN3--;
00272   }
00273 
00274   /* Processing is complete.
00275    ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
00276    ** This prepares the state buffer for the next function call. */
00277 
00278   /* Points to the start of the state buffer */
00279   pStateCurnt = S->pState;
00280 
00281   i = (numTaps - 1U) >> 2U;
00282 
00283   /* copy data */
00284   while (i > 0U)
00285   {
00286     *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
00287     *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
00288 
00289     /* Decrement the loop counter */
00290     i--;
00291   }
00292 
00293   i = (numTaps - 1U) % 0x04U;
00294 
00295   /* copy data */
00296   while (i > 0U)
00297   {
00298     *pStateCurnt++ = *pState++;
00299 
00300     /* Decrement the loop counter */
00301     i--;
00302   }
00303 }
00304 
00305 #else
00306 
00307 
00308 void arm_fir_decimate_fast_q15(
00309   const arm_fir_decimate_instance_q15 * S,
00310   q15_t * pSrc,
00311   q15_t * pDst,
00312   uint32_t blockSize)
00313 {
00314   q15_t *pState = S->pState;                     /* State pointer */
00315   q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
00316   q15_t *pStateCurnt;                            /* Points to the current sample of the state */
00317   q15_t *px;                                     /* Temporary pointer for state buffer */
00318   q15_t *pb;                                     /* Temporary pointer coefficient buffer */
00319   q15_t x0, x1, c0;                              /* Temporary variables to hold state and coefficient values */
00320   q31_t sum0;                                    /* Accumulators */
00321   q31_t acc0, acc1;
00322   q15_t *px0, *px1;
00323   uint32_t blkCntN3;
00324   uint32_t numTaps = S->numTaps;                 /* Number of taps */
00325   uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M;  /* Loop counters */
00326 
00327 
00328   /* S->pState buffer contains previous frame (numTaps - 1) samples */
00329   /* pStateCurnt points to the location where the new input data should be written */
00330   pStateCurnt = S->pState + (numTaps - 1U);
00331 
00332 
00333   /* Total number of output samples to be computed */
00334   blkCnt = outBlockSize / 2;
00335   blkCntN3 = outBlockSize - (2 * blkCnt);
00336 
00337   while (blkCnt > 0U)
00338   {
00339     /* Copy decimation factor number of new input samples into the state buffer */
00340     i = 2 * S->M;
00341 
00342     do
00343     {
00344       *pStateCurnt++ = *pSrc++;
00345 
00346     } while (--i);
00347 
00348     /* Set accumulator to zero */
00349     acc0 = 0;
00350     acc1 = 0;
00351 
00352     /* Initialize state pointer */
00353     px0 = pState;
00354 
00355     px1 = pState + S->M;
00356 
00357 
00358     /* Initialize coeff pointer */
00359     pb = pCoeffs;
00360 
00361     /* Loop unrolling.  Process 4 taps at a time. */
00362     tapCnt = numTaps >> 2;
00363 
00364     /* Loop over the number of taps.  Unroll by a factor of 4.
00365      ** Repeat until we've computed numTaps-4 coefficients. */
00366     while (tapCnt > 0U)
00367     {
00368       /* Read the Read b[numTaps-1] coefficients */
00369       c0 = *pb++;
00370 
00371       /* Read x[n-numTaps-1] for sample 0 and for sample 1 */
00372       x0 = *px0++;
00373       x1 = *px1++;
00374 
00375       /* Perform the multiply-accumulate */
00376       acc0 += x0 * c0;
00377       acc1 += x1 * c0;
00378 
00379       /* Read the b[numTaps-2] coefficient */
00380       c0 = *pb++;
00381 
00382       /* Read x[n-numTaps-2] for sample 0 and sample 1 */
00383       x0 = *px0++;
00384       x1 = *px1++;
00385 
00386       /* Perform the multiply-accumulate */
00387       acc0 += x0 * c0;
00388       acc1 += x1 * c0;
00389 
00390       /* Read the b[numTaps-3]  coefficients */
00391       c0 = *pb++;
00392 
00393       /* Read x[n-numTaps-3] for sample 0 and sample 1 */
00394       x0 = *px0++;
00395       x1 = *px1++;
00396 
00397       /* Perform the multiply-accumulate */
00398       acc0 += x0 * c0;
00399       acc1 += x1 * c0;
00400 
00401       /* Read the b[numTaps-4] coefficient */
00402       c0 = *pb++;
00403 
00404       /* Read x[n-numTaps-4] for sample 0 and sample 1 */
00405       x0 = *px0++;
00406       x1 = *px1++;
00407 
00408       /* Perform the multiply-accumulate */
00409       acc0 += x0 * c0;
00410       acc1 += x1 * c0;
00411 
00412       /* Decrement the loop counter */
00413       tapCnt--;
00414     }
00415 
00416     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
00417     tapCnt = numTaps % 0x4U;
00418 
00419     while (tapCnt > 0U)
00420     {
00421       /* Read coefficients */
00422       c0 = *pb++;
00423 
00424       /* Fetch 1 state variable */
00425       x0 = *px0++;
00426       x1 = *px1++;
00427 
00428       /* Perform the multiply-accumulate */
00429       acc0 += x0 * c0;
00430       acc1 += x1 * c0;
00431 
00432       /* Decrement the loop counter */
00433       tapCnt--;
00434     }
00435 
00436     /* Advance the state pointer by the decimation factor
00437      * to process the next group of decimation factor number samples */
00438     pState = pState + S->M * 2;
00439 
00440     /* Store filter output, smlad returns the values in 2.14 format */
00441     /* so downsacle by 15 to get output in 1.15 */
00442 
00443     *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
00444     *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
00445 
00446 
00447     /* Decrement the loop counter */
00448     blkCnt--;
00449   }
00450 
00451   while (blkCntN3 > 0U)
00452   {
00453     /* Copy decimation factor number of new input samples into the state buffer */
00454     i = S->M;
00455 
00456     do
00457     {
00458       *pStateCurnt++ = *pSrc++;
00459 
00460     } while (--i);
00461 
00462     /*Set sum to zero */
00463     sum0 = 0;
00464 
00465     /* Initialize state pointer */
00466     px = pState;
00467 
00468     /* Initialize coeff pointer */
00469     pb = pCoeffs;
00470 
00471     /* Loop unrolling.  Process 4 taps at a time. */
00472     tapCnt = numTaps >> 2;
00473 
00474     /* Loop over the number of taps.  Unroll by a factor of 4.
00475      ** Repeat until we've computed numTaps-4 coefficients. */
00476     while (tapCnt > 0U)
00477     {
00478       /* Read the Read b[numTaps-1] coefficients */
00479       c0 = *pb++;
00480 
00481       /* Read x[n-numTaps-1] and sample */
00482       x0 = *px++;
00483 
00484       /* Perform the multiply-accumulate */
00485       sum0 += x0 * c0;
00486 
00487       /* Read the b[numTaps-2] coefficient */
00488       c0 = *pb++;
00489 
00490       /* Read x[n-numTaps-2] and  sample */
00491       x0 = *px++;
00492 
00493       /* Perform the multiply-accumulate */
00494       sum0 += x0 * c0;
00495 
00496       /* Read the b[numTaps-3]  coefficients */
00497       c0 = *pb++;
00498 
00499       /* Read x[n-numTaps-3] sample */
00500       x0 = *px++;
00501 
00502       /* Perform the multiply-accumulate */
00503       sum0 += x0 * c0;
00504 
00505       /* Read the b[numTaps-4] coefficient */
00506       c0 = *pb++;
00507 
00508       /* Read x[n-numTaps-4] sample */
00509       x0 = *px++;
00510 
00511       /* Perform the multiply-accumulate */
00512       sum0 += x0 * c0;
00513 
00514       /* Decrement the loop counter */
00515       tapCnt--;
00516     }
00517 
00518     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
00519     tapCnt = numTaps % 0x4U;
00520 
00521     while (tapCnt > 0U)
00522     {
00523       /* Read coefficients */
00524       c0 = *pb++;
00525 
00526       /* Fetch 1 state variable */
00527       x0 = *px++;
00528 
00529       /* Perform the multiply-accumulate */
00530       sum0 += x0 * c0;
00531 
00532       /* Decrement the loop counter */
00533       tapCnt--;
00534     }
00535 
00536     /* Advance the state pointer by the decimation factor
00537      * to process the next group of decimation factor number samples */
00538     pState = pState + S->M;
00539 
00540     /* Store filter output, smlad returns the values in 2.14 format */
00541     /* so downsacle by 15 to get output in 1.15 */
00542     *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
00543 
00544     /* Decrement the loop counter */
00545     blkCntN3--;
00546   }
00547 
00548   /* Processing is complete.
00549    ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
00550    ** This prepares the state buffer for the next function call. */
00551 
00552   /* Points to the start of the state buffer */
00553   pStateCurnt = S->pState;
00554 
00555   i = (numTaps - 1U) >> 2U;
00556 
00557   /* copy data */
00558   while (i > 0U)
00559   {
00560     *pStateCurnt++ = *pState++;
00561     *pStateCurnt++ = *pState++;
00562     *pStateCurnt++ = *pState++;
00563     *pStateCurnt++ = *pState++;
00564 
00565     /* Decrement the loop counter */
00566     i--;
00567   }
00568 
00569   i = (numTaps - 1U) % 0x04U;
00570 
00571   /* copy data */
00572   while (i > 0U)
00573   {
00574     *pStateCurnt++ = *pState++;
00575 
00576     /* Decrement the loop counter */
00577     i--;
00578   }
00579 }
00580 
00581 
00582 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
00583 
00584 /**
00585  * @} end of FIR_decimate group
00586  */
00587