CMSIS DSP library

Dependents:   KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more

Fork of mbed-dsp by mbed official

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_fir_decimate_fast_q15.c Source File

arm_fir_decimate_fast_q15.c

00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        17. January 2013
00005 * $Revision:    V1.4.1
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_fir_decimate_fast_q15.c    
00009 *    
00010 * Description:  Fast Q15 FIR Decimator.    
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.    
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**    
00044  * @ingroup groupFilters    
00045  */
00046 
00047 /**    
00048  * @addtogroup FIR_decimate    
00049  * @{    
00050  */
00051 
00052 /**    
00053  * @brief Processing function for the Q15 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4.    
00054  * @param[in] *S points to an instance of the Q15 FIR decimator structure.    
00055  * @param[in] *pSrc points to the block of input data.    
00056  * @param[out] *pDst points to the block of output data    
00057  * @param[in] blockSize number of input samples to process per call.    
00058  * @return none    
00059  *    
00060  * \par Restrictions   
00061  *  If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
00062  *  In this case the input, output, and state buffers should be 32-bit aligned.
00063  *    
00064  * <b>Scaling and Overflow Behavior:</b>    
00065  * \par    
00066  * This fast version uses a 32-bit accumulator with 2.30 format.    
00067  * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.    
00068  * Thus, if the accumulator result overflows it wraps around and distorts the result.    
00069  * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (log2 is read as log to the base 2).    
00070  * The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result.    
00071  *    
00072  * \par    
00073  * Refer to the function <code>arm_fir_decimate_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.    
00074  * Both the slow and the fast versions use the same instance structure.    
00075  * Use the function <code>arm_fir_decimate_init_q15()</code> to initialize the filter structure.    
00076  */
00077 
00078 #ifndef UNALIGNED_SUPPORT_DISABLE
00079 
00080 void arm_fir_decimate_fast_q15(
00081   const arm_fir_decimate_instance_q15 * S,
00082   q15_t * pSrc,
00083   q15_t * pDst,
00084   uint32_t blockSize)
00085 {
00086   q15_t *pState = S->pState;                     /* State pointer */
00087   q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
00088   q15_t *pStateCurnt;                            /* Points to the current sample of the state */
00089   q15_t *px;                                     /* Temporary pointer for state buffer */
00090   q15_t *pb;                                     /* Temporary pointer coefficient buffer */
00091   q31_t x0, x1, c0, c1;                          /* Temporary variables to hold state and coefficient values */
00092   q31_t sum0;                                    /* Accumulators */
00093   q31_t acc0, acc1;
00094   q15_t *px0, *px1;
00095   uint32_t blkCntN3;
00096   uint32_t numTaps = S->numTaps;                 /* Number of taps */
00097   uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M;  /* Loop counters */
00098 
00099 
00100   /* S->pState buffer contains previous frame (numTaps - 1) samples */
00101   /* pStateCurnt points to the location where the new input data should be written */
00102   pStateCurnt = S->pState + (numTaps - 1u);
00103 
00104 
00105   /* Total number of output samples to be computed */
00106   blkCnt = outBlockSize / 2;
00107   blkCntN3 = outBlockSize - (2 * blkCnt);
00108 
00109 
00110   while(blkCnt > 0u)
00111   {
00112     /* Copy decimation factor number of new input samples into the state buffer */
00113     i = 2 * S->M;
00114 
00115     do
00116     {
00117       *pStateCurnt++ = *pSrc++;
00118 
00119     } while(--i);
00120 
00121     /* Set accumulator to zero */
00122     acc0 = 0;
00123     acc1 = 0;
00124 
00125     /* Initialize state pointer */
00126     px0 = pState;
00127 
00128     px1 = pState + S->M;
00129 
00130 
00131     /* Initialize coeff pointer */
00132     pb = pCoeffs;
00133 
00134     /* Loop unrolling.  Process 4 taps at a time. */
00135     tapCnt = numTaps >> 2;
00136 
00137     /* Loop over the number of taps.  Unroll by a factor of 4.       
00138      ** Repeat until we've computed numTaps-4 coefficients. */
00139     while(tapCnt > 0u)
00140     {
00141       /* Read the Read b[numTaps-1] and b[numTaps-2]  coefficients */
00142       c0 = *__SIMD32(pb)++;
00143 
00144       /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */
00145       x0 = *__SIMD32(px0)++;
00146 
00147       x1 = *__SIMD32(px1)++;
00148 
00149       /* Perform the multiply-accumulate */
00150       acc0 = __SMLAD(x0, c0, acc0);
00151 
00152       acc1 = __SMLAD(x1, c0, acc1);
00153 
00154       /* Read the b[numTaps-3] and b[numTaps-4] coefficient */
00155       c0 = *__SIMD32(pb)++;
00156 
00157       /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
00158       x0 = *__SIMD32(px0)++;
00159 
00160       x1 = *__SIMD32(px1)++;
00161 
00162       /* Perform the multiply-accumulate */
00163       acc0 = __SMLAD(x0, c0, acc0);
00164 
00165       acc1 = __SMLAD(x1, c0, acc1);
00166 
00167       /* Decrement the loop counter */
00168       tapCnt--;
00169     }
00170 
00171     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
00172     tapCnt = numTaps % 0x4u;
00173 
00174     while(tapCnt > 0u)
00175     {
00176       /* Read coefficients */
00177       c0 = *pb++;
00178 
00179       /* Fetch 1 state variable */
00180       x0 = *px0++;
00181 
00182       x1 = *px1++;
00183 
00184       /* Perform the multiply-accumulate */
00185       acc0 = __SMLAD(x0, c0, acc0);
00186       acc1 = __SMLAD(x1, c0, acc1);
00187 
00188       /* Decrement the loop counter */
00189       tapCnt--;
00190     }
00191 
00192     /* Advance the state pointer by the decimation factor       
00193      * to process the next group of decimation factor number samples */
00194     pState = pState + S->M * 2;
00195 
00196     /* Store filter output, smlad returns the values in 2.14 format */
00197     /* so downsacle by 15 to get output in 1.15 */
00198     *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
00199     *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
00200 
00201     /* Decrement the loop counter */
00202     blkCnt--;
00203   }
00204 
00205 
00206 
00207   while(blkCntN3 > 0u)
00208   {
00209     /* Copy decimation factor number of new input samples into the state buffer */
00210     i = S->M;
00211 
00212     do
00213     {
00214       *pStateCurnt++ = *pSrc++;
00215 
00216     } while(--i);
00217 
00218     /*Set sum to zero */
00219     sum0 = 0;
00220 
00221     /* Initialize state pointer */
00222     px = pState;
00223 
00224     /* Initialize coeff pointer */
00225     pb = pCoeffs;
00226 
00227     /* Loop unrolling.  Process 4 taps at a time. */
00228     tapCnt = numTaps >> 2;
00229 
00230     /* Loop over the number of taps.  Unroll by a factor of 4.       
00231      ** Repeat until we've computed numTaps-4 coefficients. */
00232     while(tapCnt > 0u)
00233     {
00234       /* Read the Read b[numTaps-1] and b[numTaps-2]  coefficients */
00235       c0 = *__SIMD32(pb)++;
00236 
00237       /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */
00238       x0 = *__SIMD32(px)++;
00239 
00240       /* Read the b[numTaps-3] and b[numTaps-4] coefficient */
00241       c1 = *__SIMD32(pb)++;
00242 
00243       /* Perform the multiply-accumulate */
00244       sum0 = __SMLAD(x0, c0, sum0);
00245 
00246       /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
00247       x0 = *__SIMD32(px)++;
00248 
00249       /* Perform the multiply-accumulate */
00250       sum0 = __SMLAD(x0, c1, sum0);
00251 
00252       /* Decrement the loop counter */
00253       tapCnt--;
00254     }
00255 
00256     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
00257     tapCnt = numTaps % 0x4u;
00258 
00259     while(tapCnt > 0u)
00260     {
00261       /* Read coefficients */
00262       c0 = *pb++;
00263 
00264       /* Fetch 1 state variable */
00265       x0 = *px++;
00266 
00267       /* Perform the multiply-accumulate */
00268       sum0 = __SMLAD(x0, c0, sum0);
00269 
00270       /* Decrement the loop counter */
00271       tapCnt--;
00272     }
00273 
00274     /* Advance the state pointer by the decimation factor       
00275      * to process the next group of decimation factor number samples */
00276     pState = pState + S->M;
00277 
00278     /* Store filter output, smlad returns the values in 2.14 format */
00279     /* so downsacle by 15 to get output in 1.15 */
00280     *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
00281 
00282     /* Decrement the loop counter */
00283     blkCntN3--;
00284   }
00285 
00286   /* Processing is complete.       
00287    ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.       
00288    ** This prepares the state buffer for the next function call. */
00289 
00290   /* Points to the start of the state buffer */
00291   pStateCurnt = S->pState;
00292 
00293   i = (numTaps - 1u) >> 2u;
00294 
00295   /* copy data */
00296   while(i > 0u)
00297   {
00298     *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
00299     *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
00300 
00301     /* Decrement the loop counter */
00302     i--;
00303   }
00304 
00305   i = (numTaps - 1u) % 0x04u;
00306 
00307   /* copy data */
00308   while(i > 0u)
00309   {
00310     *pStateCurnt++ = *pState++;
00311 
00312     /* Decrement the loop counter */
00313     i--;
00314   }
00315 }
00316 
00317 #else
00318 
00319 
00320 void arm_fir_decimate_fast_q15(
00321   const arm_fir_decimate_instance_q15 * S,
00322   q15_t * pSrc,
00323   q15_t * pDst,
00324   uint32_t blockSize)
00325 {
00326   q15_t *pState = S->pState;                     /* State pointer */
00327   q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
00328   q15_t *pStateCurnt;                            /* Points to the current sample of the state */
00329   q15_t *px;                                     /* Temporary pointer for state buffer */
00330   q15_t *pb;                                     /* Temporary pointer coefficient buffer */
00331   q15_t x0, x1, c0;                              /* Temporary variables to hold state and coefficient values */
00332   q31_t sum0;                                    /* Accumulators */
00333   q31_t acc0, acc1;
00334   q15_t *px0, *px1;
00335   uint32_t blkCntN3;
00336   uint32_t numTaps = S->numTaps;                 /* Number of taps */
00337   uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M;  /* Loop counters */
00338 
00339 
00340   /* S->pState buffer contains previous frame (numTaps - 1) samples */
00341   /* pStateCurnt points to the location where the new input data should be written */
00342   pStateCurnt = S->pState + (numTaps - 1u);
00343 
00344 
00345   /* Total number of output samples to be computed */
00346   blkCnt = outBlockSize / 2;
00347   blkCntN3 = outBlockSize - (2 * blkCnt);
00348 
00349   while(blkCnt > 0u)
00350   {
00351     /* Copy decimation factor number of new input samples into the state buffer */
00352     i = 2 * S->M;
00353 
00354     do
00355     {
00356       *pStateCurnt++ = *pSrc++;
00357 
00358     } while(--i);
00359 
00360     /* Set accumulator to zero */
00361     acc0 = 0;
00362     acc1 = 0;
00363 
00364     /* Initialize state pointer */
00365     px0 = pState;
00366 
00367     px1 = pState + S->M;
00368 
00369 
00370     /* Initialize coeff pointer */
00371     pb = pCoeffs;
00372 
00373     /* Loop unrolling.  Process 4 taps at a time. */
00374     tapCnt = numTaps >> 2;
00375 
00376     /* Loop over the number of taps.  Unroll by a factor of 4.       
00377      ** Repeat until we've computed numTaps-4 coefficients. */
00378     while(tapCnt > 0u)
00379     {
00380       /* Read the Read b[numTaps-1] coefficients */
00381       c0 = *pb++;
00382 
00383       /* Read x[n-numTaps-1] for sample 0 and for sample 1 */
00384       x0 = *px0++;
00385       x1 = *px1++;
00386 
00387       /* Perform the multiply-accumulate */
00388       acc0 += x0 * c0;
00389       acc1 += x1 * c0;
00390 
00391       /* Read the b[numTaps-2] coefficient */
00392       c0 = *pb++;
00393 
00394       /* Read x[n-numTaps-2] for sample 0 and sample 1 */
00395       x0 = *px0++;
00396       x1 = *px1++;
00397 
00398       /* Perform the multiply-accumulate */
00399       acc0 += x0 * c0;
00400       acc1 += x1 * c0;
00401 
00402       /* Read the b[numTaps-3]  coefficients */
00403       c0 = *pb++;
00404 
00405       /* Read x[n-numTaps-3] for sample 0 and sample 1 */
00406       x0 = *px0++;
00407       x1 = *px1++;
00408 
00409       /* Perform the multiply-accumulate */
00410       acc0 += x0 * c0;
00411       acc1 += x1 * c0;
00412 
00413       /* Read the b[numTaps-4] coefficient */
00414       c0 = *pb++;
00415 
00416       /* Read x[n-numTaps-4] for sample 0 and sample 1 */
00417       x0 = *px0++;
00418       x1 = *px1++;
00419 
00420       /* Perform the multiply-accumulate */
00421       acc0 += x0 * c0;
00422       acc1 += x1 * c0;
00423 
00424       /* Decrement the loop counter */
00425       tapCnt--;
00426     }
00427 
00428     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
00429     tapCnt = numTaps % 0x4u;
00430 
00431     while(tapCnt > 0u)
00432     {
00433       /* Read coefficients */
00434       c0 = *pb++;
00435 
00436       /* Fetch 1 state variable */
00437       x0 = *px0++;
00438       x1 = *px1++;
00439 
00440       /* Perform the multiply-accumulate */
00441       acc0 += x0 * c0;
00442       acc1 += x1 * c0;
00443 
00444       /* Decrement the loop counter */
00445       tapCnt--;
00446     }
00447 
00448     /* Advance the state pointer by the decimation factor       
00449      * to process the next group of decimation factor number samples */
00450     pState = pState + S->M * 2;
00451 
00452     /* Store filter output, smlad returns the values in 2.14 format */
00453     /* so downsacle by 15 to get output in 1.15 */
00454 
00455     *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
00456     *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
00457 
00458 
00459     /* Decrement the loop counter */
00460     blkCnt--;
00461   }
00462 
00463   while(blkCntN3 > 0u)
00464   {
00465     /* Copy decimation factor number of new input samples into the state buffer */
00466     i = S->M;
00467 
00468     do
00469     {
00470       *pStateCurnt++ = *pSrc++;
00471 
00472     } while(--i);
00473 
00474     /*Set sum to zero */
00475     sum0 = 0;
00476 
00477     /* Initialize state pointer */
00478     px = pState;
00479 
00480     /* Initialize coeff pointer */
00481     pb = pCoeffs;
00482 
00483     /* Loop unrolling.  Process 4 taps at a time. */
00484     tapCnt = numTaps >> 2;
00485 
00486     /* Loop over the number of taps.  Unroll by a factor of 4.       
00487      ** Repeat until we've computed numTaps-4 coefficients. */
00488     while(tapCnt > 0u)
00489     {
00490       /* Read the Read b[numTaps-1] coefficients */
00491       c0 = *pb++;
00492 
00493       /* Read x[n-numTaps-1] and sample */
00494       x0 = *px++;
00495 
00496       /* Perform the multiply-accumulate */
00497       sum0 += x0 * c0;
00498 
00499       /* Read the b[numTaps-2] coefficient */
00500       c0 = *pb++;
00501 
00502       /* Read x[n-numTaps-2] and  sample */
00503       x0 = *px++;
00504 
00505       /* Perform the multiply-accumulate */
00506       sum0 += x0 * c0;
00507 
00508       /* Read the b[numTaps-3]  coefficients */
00509       c0 = *pb++;
00510 
00511       /* Read x[n-numTaps-3] sample */
00512       x0 = *px++;
00513 
00514       /* Perform the multiply-accumulate */
00515       sum0 += x0 * c0;
00516 
00517       /* Read the b[numTaps-4] coefficient */
00518       c0 = *pb++;
00519 
00520       /* Read x[n-numTaps-4] sample */
00521       x0 = *px++;
00522 
00523       /* Perform the multiply-accumulate */
00524       sum0 += x0 * c0;
00525 
00526       /* Decrement the loop counter */
00527       tapCnt--;
00528     }
00529 
00530     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
00531     tapCnt = numTaps % 0x4u;
00532 
00533     while(tapCnt > 0u)
00534     {
00535       /* Read coefficients */
00536       c0 = *pb++;
00537 
00538       /* Fetch 1 state variable */
00539       x0 = *px++;
00540 
00541       /* Perform the multiply-accumulate */
00542       sum0 += x0 * c0;
00543 
00544       /* Decrement the loop counter */
00545       tapCnt--;
00546     }
00547 
00548     /* Advance the state pointer by the decimation factor       
00549      * to process the next group of decimation factor number samples */
00550     pState = pState + S->M;
00551 
00552     /* Store filter output, smlad returns the values in 2.14 format */
00553     /* so downsacle by 15 to get output in 1.15 */
00554     *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
00555 
00556     /* Decrement the loop counter */
00557     blkCntN3--;
00558   }
00559 
00560   /* Processing is complete.       
00561    ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.       
00562    ** This prepares the state buffer for the next function call. */
00563 
00564   /* Points to the start of the state buffer */
00565   pStateCurnt = S->pState;
00566 
00567   i = (numTaps - 1u) >> 2u;
00568 
00569   /* copy data */
00570   while(i > 0u)
00571   {
00572     *pStateCurnt++ = *pState++;
00573     *pStateCurnt++ = *pState++;
00574     *pStateCurnt++ = *pState++;
00575     *pStateCurnt++ = *pState++;
00576 
00577     /* Decrement the loop counter */
00578     i--;
00579   }
00580 
00581   i = (numTaps - 1u) % 0x04u;
00582 
00583   /* copy data */
00584   while(i > 0u)
00585   {
00586     *pStateCurnt++ = *pState++;
00587 
00588     /* Decrement the loop counter */
00589     i--;
00590   }
00591 }
00592 
00593 
00594 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
00595 
00596 /**    
00597  * @} end of FIR_decimate group    
00598  */