Added CMSIS5 DSP and NN folder. Needs some work

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_fir_decimate_q15.c Source File

arm_fir_decimate_q15.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_fir_decimate_q15.c
00004  * Description:  Q15 FIR Decimator
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupFilters
00033  */
00034 
00035 /**
00036  * @addtogroup FIR_decimate
00037  * @{
00038  */
00039 
00040 /**
00041  * @brief Processing function for the Q15 FIR decimator.
00042  * @param[in] *S points to an instance of the Q15 FIR decimator structure.
00043  * @param[in] *pSrc points to the block of input data.
00044  * @param[out] *pDst points to the location where the output result is written.
00045  * @param[in] blockSize number of input samples to process per call.
00046  * @return none.
00047  *
00048  * <b>Scaling and Overflow Behavior:</b>
00049  * \par
00050  * The function is implemented using a 64-bit internal accumulator.
00051  * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
00052  * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
00053  * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
00054  * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
00055  * Lastly, the accumulator is saturated to yield a result in 1.15 format.
00056  *
00057  * \par
00058  * Refer to the function <code>arm_fir_decimate_fast_q15()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
00059  */
00060 
00061 #if defined (ARM_MATH_DSP)
00062 
00063 #ifndef UNALIGNED_SUPPORT_DISABLE
00064 
00065 void arm_fir_decimate_q15(
00066   const arm_fir_decimate_instance_q15 * S,
00067   q15_t * pSrc,
00068   q15_t * pDst,
00069   uint32_t blockSize)
00070 {
00071   q15_t *pState = S->pState;                     /* State pointer */
00072   q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
00073   q15_t *pStateCurnt;                            /* Points to the current sample of the state */
00074   q15_t *px;                                     /* Temporary pointer for state buffer */
00075   q15_t *pb;                                     /* Temporary pointer coefficient buffer */
00076   q31_t x0, x1, c0, c1;                          /* Temporary variables to hold state and coefficient values */
00077   q63_t sum0;                                    /* Accumulators */
00078   q63_t acc0, acc1;
00079   q15_t *px0, *px1;
00080   uint32_t blkCntN3;
00081   uint32_t numTaps = S->numTaps;                 /* Number of taps */
00082   uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M;  /* Loop counters */
00083 
00084 
00085   /* S->pState buffer contains previous frame (numTaps - 1) samples */
00086   /* pStateCurnt points to the location where the new input data should be written */
00087   pStateCurnt = S->pState + (numTaps - 1U);
00088 
00089 
00090   /* Total number of output samples to be computed */
00091   blkCnt = outBlockSize / 2;
00092   blkCntN3 = outBlockSize - (2 * blkCnt);
00093 
00094 
00095   while (blkCnt > 0U)
00096   {
00097     /* Copy decimation factor number of new input samples into the state buffer */
00098     i = 2 * S->M;
00099 
00100     do
00101     {
00102       *pStateCurnt++ = *pSrc++;
00103 
00104     } while (--i);
00105 
00106     /* Set accumulator to zero */
00107     acc0 = 0;
00108     acc1 = 0;
00109 
00110     /* Initialize state pointer */
00111     px0 = pState;
00112 
00113     px1 = pState + S->M;
00114 
00115 
00116     /* Initialize coeff pointer */
00117     pb = pCoeffs;
00118 
00119     /* Loop unrolling.  Process 4 taps at a time. */
00120     tapCnt = numTaps >> 2;
00121 
00122     /* Loop over the number of taps.  Unroll by a factor of 4.
00123      ** Repeat until we've computed numTaps-4 coefficients. */
00124     while (tapCnt > 0U)
00125     {
00126       /* Read the Read b[numTaps-1] and b[numTaps-2]  coefficients */
00127       c0 = *__SIMD32(pb)++;
00128 
00129       /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */
00130       x0 = *__SIMD32(px0)++;
00131 
00132       x1 = *__SIMD32(px1)++;
00133 
00134       /* Perform the multiply-accumulate */
00135       acc0 = __SMLALD(x0, c0, acc0);
00136 
00137       acc1 = __SMLALD(x1, c0, acc1);
00138 
00139       /* Read the b[numTaps-3] and b[numTaps-4] coefficient */
00140       c0 = *__SIMD32(pb)++;
00141 
00142       /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
00143       x0 = *__SIMD32(px0)++;
00144 
00145       x1 = *__SIMD32(px1)++;
00146 
00147       /* Perform the multiply-accumulate */
00148       acc0 = __SMLALD(x0, c0, acc0);
00149 
00150       acc1 = __SMLALD(x1, c0, acc1);
00151 
00152       /* Decrement the loop counter */
00153       tapCnt--;
00154     }
00155 
00156     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
00157     tapCnt = numTaps % 0x4U;
00158 
00159     while (tapCnt > 0U)
00160     {
00161       /* Read coefficients */
00162       c0 = *pb++;
00163 
00164       /* Fetch 1 state variable */
00165       x0 = *px0++;
00166 
00167       x1 = *px1++;
00168 
00169       /* Perform the multiply-accumulate */
00170       acc0 = __SMLALD(x0, c0, acc0);
00171       acc1 = __SMLALD(x1, c0, acc1);
00172 
00173       /* Decrement the loop counter */
00174       tapCnt--;
00175     }
00176 
00177     /* Advance the state pointer by the decimation factor
00178      * to process the next group of decimation factor number samples */
00179     pState = pState + S->M * 2;
00180 
00181     /* Store filter output, smlad returns the values in 2.14 format */
00182     /* so downsacle by 15 to get output in 1.15 */
00183     *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
00184     *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
00185 
00186     /* Decrement the loop counter */
00187     blkCnt--;
00188   }
00189 
00190 
00191 
00192   while (blkCntN3 > 0U)
00193   {
00194     /* Copy decimation factor number of new input samples into the state buffer */
00195     i = S->M;
00196 
00197     do
00198     {
00199       *pStateCurnt++ = *pSrc++;
00200 
00201     } while (--i);
00202 
00203     /*Set sum to zero */
00204     sum0 = 0;
00205 
00206     /* Initialize state pointer */
00207     px = pState;
00208 
00209     /* Initialize coeff pointer */
00210     pb = pCoeffs;
00211 
00212     /* Loop unrolling.  Process 4 taps at a time. */
00213     tapCnt = numTaps >> 2;
00214 
00215     /* Loop over the number of taps.  Unroll by a factor of 4.
00216      ** Repeat until we've computed numTaps-4 coefficients. */
00217     while (tapCnt > 0U)
00218     {
00219       /* Read the Read b[numTaps-1] and b[numTaps-2]  coefficients */
00220       c0 = *__SIMD32(pb)++;
00221 
00222       /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */
00223       x0 = *__SIMD32(px)++;
00224 
00225       /* Read the b[numTaps-3] and b[numTaps-4] coefficient */
00226       c1 = *__SIMD32(pb)++;
00227 
00228       /* Perform the multiply-accumulate */
00229       sum0 = __SMLALD(x0, c0, sum0);
00230 
00231       /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
00232       x0 = *__SIMD32(px)++;
00233 
00234       /* Perform the multiply-accumulate */
00235       sum0 = __SMLALD(x0, c1, sum0);
00236 
00237       /* Decrement the loop counter */
00238       tapCnt--;
00239     }
00240 
00241     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
00242     tapCnt = numTaps % 0x4U;
00243 
00244     while (tapCnt > 0U)
00245     {
00246       /* Read coefficients */
00247       c0 = *pb++;
00248 
00249       /* Fetch 1 state variable */
00250       x0 = *px++;
00251 
00252       /* Perform the multiply-accumulate */
00253       sum0 = __SMLALD(x0, c0, sum0);
00254 
00255       /* Decrement the loop counter */
00256       tapCnt--;
00257     }
00258 
00259     /* Advance the state pointer by the decimation factor
00260      * to process the next group of decimation factor number samples */
00261     pState = pState + S->M;
00262 
00263     /* Store filter output, smlad returns the values in 2.14 format */
00264     /* so downsacle by 15 to get output in 1.15 */
00265     *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
00266 
00267     /* Decrement the loop counter */
00268     blkCntN3--;
00269   }
00270 
00271   /* Processing is complete.
00272    ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
00273    ** This prepares the state buffer for the next function call. */
00274 
00275   /* Points to the start of the state buffer */
00276   pStateCurnt = S->pState;
00277 
00278   i = (numTaps - 1U) >> 2U;
00279 
00280   /* copy data */
00281   while (i > 0U)
00282   {
00283     *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
00284     *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
00285 
00286     /* Decrement the loop counter */
00287     i--;
00288   }
00289 
00290   i = (numTaps - 1U) % 0x04U;
00291 
00292   /* copy data */
00293   while (i > 0U)
00294   {
00295     *pStateCurnt++ = *pState++;
00296 
00297     /* Decrement the loop counter */
00298     i--;
00299   }
00300 }
00301 
00302 #else
00303 
00304 
00305 void arm_fir_decimate_q15(
00306   const arm_fir_decimate_instance_q15 * S,
00307   q15_t * pSrc,
00308   q15_t * pDst,
00309   uint32_t blockSize)
00310 {
00311   q15_t *pState = S->pState;                     /* State pointer */
00312   q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
00313   q15_t *pStateCurnt;                            /* Points to the current sample of the state */
00314   q15_t *px;                                     /* Temporary pointer for state buffer */
00315   q15_t *pb;                                     /* Temporary pointer coefficient buffer */
00316   q15_t x0, x1, c0;                              /* Temporary variables to hold state and coefficient values */
00317   q63_t sum0;                                    /* Accumulators */
00318   q63_t acc0, acc1;
00319   q15_t *px0, *px1;
00320   uint32_t blkCntN3;
00321   uint32_t numTaps = S->numTaps;                 /* Number of taps */
00322   uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M;  /* Loop counters */
00323 
00324 
00325   /* S->pState buffer contains previous frame (numTaps - 1) samples */
00326   /* pStateCurnt points to the location where the new input data should be written */
00327   pStateCurnt = S->pState + (numTaps - 1U);
00328 
00329 
00330   /* Total number of output samples to be computed */
00331   blkCnt = outBlockSize / 2;
00332   blkCntN3 = outBlockSize - (2 * blkCnt);
00333 
00334   while (blkCnt > 0U)
00335   {
00336     /* Copy decimation factor number of new input samples into the state buffer */
00337     i = 2 * S->M;
00338 
00339     do
00340     {
00341       *pStateCurnt++ = *pSrc++;
00342 
00343     } while (--i);
00344 
00345     /* Set accumulator to zero */
00346     acc0 = 0;
00347     acc1 = 0;
00348 
00349     /* Initialize state pointer */
00350     px0 = pState;
00351 
00352     px1 = pState + S->M;
00353 
00354 
00355     /* Initialize coeff pointer */
00356     pb = pCoeffs;
00357 
00358     /* Loop unrolling.  Process 4 taps at a time. */
00359     tapCnt = numTaps >> 2;
00360 
00361     /* Loop over the number of taps.  Unroll by a factor of 4.
00362      ** Repeat until we've computed numTaps-4 coefficients. */
00363     while (tapCnt > 0U)
00364     {
00365       /* Read the Read b[numTaps-1] coefficients */
00366       c0 = *pb++;
00367 
00368       /* Read x[n-numTaps-1] for sample 0 and for sample 1 */
00369       x0 = *px0++;
00370       x1 = *px1++;
00371 
00372       /* Perform the multiply-accumulate */
00373       acc0 += x0 * c0;
00374       acc1 += x1 * c0;
00375 
00376       /* Read the b[numTaps-2] coefficient */
00377       c0 = *pb++;
00378 
00379       /* Read x[n-numTaps-2] for sample 0 and sample 1 */
00380       x0 = *px0++;
00381       x1 = *px1++;
00382 
00383       /* Perform the multiply-accumulate */
00384       acc0 += x0 * c0;
00385       acc1 += x1 * c0;
00386 
00387       /* Read the b[numTaps-3] coefficients */
00388       c0 = *pb++;
00389 
00390       /* Read x[n-numTaps-3] for sample 0 and sample 1 */
00391       x0 = *px0++;
00392       x1 = *px1++;
00393 
00394       /* Perform the multiply-accumulate */
00395       acc0 += x0 * c0;
00396       acc1 += x1 * c0;
00397 
00398       /* Read the b[numTaps-4] coefficient */
00399       c0 = *pb++;
00400 
00401       /* Read x[n-numTaps-4] for sample 0 and sample 1 */
00402       x0 = *px0++;
00403       x1 = *px1++;
00404 
00405       /* Perform the multiply-accumulate */
00406       acc0 += x0 * c0;
00407       acc1 += x1 * c0;
00408 
00409       /* Decrement the loop counter */
00410       tapCnt--;
00411     }
00412 
00413     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
00414     tapCnt = numTaps % 0x4U;
00415 
00416     while (tapCnt > 0U)
00417     {
00418       /* Read coefficients */
00419       c0 = *pb++;
00420 
00421       /* Fetch 1 state variable */
00422       x0 = *px0++;
00423       x1 = *px1++;
00424 
00425       /* Perform the multiply-accumulate */
00426       acc0 += x0 * c0;
00427       acc1 += x1 * c0;
00428 
00429       /* Decrement the loop counter */
00430       tapCnt--;
00431     }
00432 
00433     /* Advance the state pointer by the decimation factor
00434      * to process the next group of decimation factor number samples */
00435     pState = pState + S->M * 2;
00436 
00437     /* Store filter output, smlad returns the values in 2.14 format */
00438     /* so downsacle by 15 to get output in 1.15 */
00439 
00440     *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
00441     *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
00442 
00443     /* Decrement the loop counter */
00444     blkCnt--;
00445   }
00446 
00447   while (blkCntN3 > 0U)
00448   {
00449     /* Copy decimation factor number of new input samples into the state buffer */
00450     i = S->M;
00451 
00452     do
00453     {
00454       *pStateCurnt++ = *pSrc++;
00455 
00456     } while (--i);
00457 
00458     /*Set sum to zero */
00459     sum0 = 0;
00460 
00461     /* Initialize state pointer */
00462     px = pState;
00463 
00464     /* Initialize coeff pointer */
00465     pb = pCoeffs;
00466 
00467     /* Loop unrolling.  Process 4 taps at a time. */
00468     tapCnt = numTaps >> 2;
00469 
00470     /* Loop over the number of taps.  Unroll by a factor of 4.
00471      ** Repeat until we've computed numTaps-4 coefficients. */
00472     while (tapCnt > 0U)
00473     {
00474       /* Read the Read b[numTaps-1] coefficients */
00475       c0 = *pb++;
00476 
00477       /* Read x[n-numTaps-1] and sample */
00478       x0 = *px++;
00479 
00480       /* Perform the multiply-accumulate */
00481       sum0 += x0 * c0;
00482 
00483       /* Read the b[numTaps-2] coefficient */
00484       c0 = *pb++;
00485 
00486       /* Read x[n-numTaps-2] and  sample */
00487       x0 = *px++;
00488 
00489       /* Perform the multiply-accumulate */
00490       sum0 += x0 * c0;
00491 
00492       /* Read the b[numTaps-3]  coefficients */
00493       c0 = *pb++;
00494 
00495       /* Read x[n-numTaps-3] sample */
00496       x0 = *px++;
00497 
00498       /* Perform the multiply-accumulate */
00499       sum0 += x0 * c0;
00500 
00501       /* Read the b[numTaps-4] coefficient */
00502       c0 = *pb++;
00503 
00504       /* Read x[n-numTaps-4] sample */
00505       x0 = *px++;
00506 
00507       /* Perform the multiply-accumulate */
00508       sum0 += x0 * c0;
00509 
00510       /* Decrement the loop counter */
00511       tapCnt--;
00512     }
00513 
00514     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
00515     tapCnt = numTaps % 0x4U;
00516 
00517     while (tapCnt > 0U)
00518     {
00519       /* Read coefficients */
00520       c0 = *pb++;
00521 
00522       /* Fetch 1 state variable */
00523       x0 = *px++;
00524 
00525       /* Perform the multiply-accumulate */
00526       sum0 += x0 * c0;
00527 
00528       /* Decrement the loop counter */
00529       tapCnt--;
00530     }
00531 
00532     /* Advance the state pointer by the decimation factor
00533      * to process the next group of decimation factor number samples */
00534     pState = pState + S->M;
00535 
00536     /* Store filter output, smlad returns the values in 2.14 format */
00537     /* so downsacle by 15 to get output in 1.15 */
00538     *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
00539 
00540     /* Decrement the loop counter */
00541     blkCntN3--;
00542   }
00543 
00544   /* Processing is complete.
00545    ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
00546    ** This prepares the state buffer for the next function call. */
00547 
00548   /* Points to the start of the state buffer */
00549   pStateCurnt = S->pState;
00550 
00551   i = (numTaps - 1U) >> 2U;
00552 
00553   /* copy data */
00554   while (i > 0U)
00555   {
00556     *pStateCurnt++ = *pState++;
00557     *pStateCurnt++ = *pState++;
00558     *pStateCurnt++ = *pState++;
00559     *pStateCurnt++ = *pState++;
00560 
00561     /* Decrement the loop counter */
00562     i--;
00563   }
00564 
00565   i = (numTaps - 1U) % 0x04U;
00566 
00567   /* copy data */
00568   while (i > 0U)
00569   {
00570     *pStateCurnt++ = *pState++;
00571 
00572     /* Decrement the loop counter */
00573     i--;
00574   }
00575 }
00576 
00577 
00578 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
00579 
00580 #else
00581 
00582 
00583 void arm_fir_decimate_q15(
00584   const arm_fir_decimate_instance_q15 * S,
00585   q15_t * pSrc,
00586   q15_t * pDst,
00587   uint32_t blockSize)
00588 {
00589   q15_t *pState = S->pState;                     /* State pointer */
00590   q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
00591   q15_t *pStateCurnt;                            /* Points to the current sample of the state */
00592   q15_t *px;                                     /* Temporary pointer for state buffer */
00593   q15_t *pb;                                     /* Temporary pointer coefficient buffer */
00594   q31_t x0, c0;                                  /* Temporary variables to hold state and coefficient values */
00595   q63_t sum0;                                    /* Accumulators */
00596   uint32_t numTaps = S->numTaps;                 /* Number of taps */
00597   uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M;  /* Loop counters */
00598 
00599 
00600 
00601 /* Run the below code for Cortex-M0 */
00602 
00603   /* S->pState buffer contains previous frame (numTaps - 1) samples */
00604   /* pStateCurnt points to the location where the new input data should be written */
00605   pStateCurnt = S->pState + (numTaps - 1U);
00606 
00607   /* Total number of output samples to be computed */
00608   blkCnt = outBlockSize;
00609 
00610   while (blkCnt > 0U)
00611   {
00612     /* Copy decimation factor number of new input samples into the state buffer */
00613     i = S->M;
00614 
00615     do
00616     {
00617       *pStateCurnt++ = *pSrc++;
00618 
00619     } while (--i);
00620 
00621     /*Set sum to zero */
00622     sum0 = 0;
00623 
00624     /* Initialize state pointer */
00625     px = pState;
00626 
00627     /* Initialize coeff pointer */
00628     pb = pCoeffs;
00629 
00630     tapCnt = numTaps;
00631 
00632     while (tapCnt > 0U)
00633     {
00634       /* Read coefficients */
00635       c0 = *pb++;
00636 
00637       /* Fetch 1 state variable */
00638       x0 = *px++;
00639 
00640       /* Perform the multiply-accumulate */
00641       sum0 += (q31_t) x0 *c0;
00642 
00643       /* Decrement the loop counter */
00644       tapCnt--;
00645     }
00646 
00647     /* Advance the state pointer by the decimation factor
00648      * to process the next group of decimation factor number samples */
00649     pState = pState + S->M;
00650 
00651     /*Store filter output , smlad will return the values in 2.14 format */
00652     /* so downsacle by 15 to get output in 1.15 */
00653     *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
00654 
00655     /* Decrement the loop counter */
00656     blkCnt--;
00657   }
00658 
00659   /* Processing is complete.
00660    ** Now copy the last numTaps - 1 samples to the start of the state buffer.
00661    ** This prepares the state buffer for the next function call. */
00662 
00663   /* Points to the start of the state buffer */
00664   pStateCurnt = S->pState;
00665 
00666   i = numTaps - 1U;
00667 
00668   /* copy data */
00669   while (i > 0U)
00670   {
00671     *pStateCurnt++ = *pState++;
00672 
00673     /* Decrement the loop counter */
00674     i--;
00675   }
00676 
00677 
00678 }
00679 #endif /*   #if defined (ARM_MATH_DSP) */
00680 
00681 
00682 /**
00683  * @} end of FIR_decimate group
00684  */
00685