CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_fir_sparse_q15.c Source File

arm_fir_sparse_q15.c

00001 /* ----------------------------------------------------------------------  
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        29. November 2010  
00005 * $Revision:    V1.0.3  
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_fir_sparse_q15.c  
00009 *  
00010 * Description:  Q15 sparse FIR filter processing function. 
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.3 2010/11/29 
00015 *    Re-organized the CMSIS folders and updated documentation.  
00016 *   
00017 * Version 1.0.2 2010/11/11  
00018 *    Documentation updated.   
00019 *  
00020 * Version 1.0.1 2010/10/05   
00021 *    Production release and review comments incorporated.  
00022 *  
00023 * Version 1.0.0 2010/09/20   
00024 *    Production release and review comments incorporated  
00025 *  
00026 * Version 0.0.7  2010/06/10   
00027 *    Misra-C changes done  
00028 * ------------------------------------------------------------------- */ 
00029 #include "arm_math.h" 
00030  
00031 /**  
00032  * @addtogroup FIR_Sparse  
00033  * @{  
00034  */ 
00035  
00036 /** 
00037  * @brief Processing function for the Q15 sparse FIR filter. 
00038  * @param[in]  *S           points to an instance of the Q15 sparse FIR structure. 
00039  * @param[in]  *pSrc        points to the block of input data. 
00040  * @param[out] *pDst        points to the block of output data 
00041  * @param[in]  *pScratchIn  points to a temporary buffer of size blockSize. 
00042  * @param[in]  *pScratchOut points to a temporary buffer of size blockSize. 
00043  * @param[in]  blockSize    number of input samples to process per call. 
00044  * @return none. 
00045  *  
00046  * <b>Scaling and Overflow Behavior:</b>  
00047  * \par  
00048  * The function is implemented using an internal 32-bit accumulator. 
00049  * The 1.15 x 1.15 multiplications yield a 2.30 result and these are added to a 2.30 accumulator. 
00050  * Thus the full precision of the multiplications is maintained but there is only a single guard bit in the accumulator. 
00051  * If the accumulator result overflows it will wrap around rather than saturate. 
00052  * After all multiply-accumulates are performed, the 2.30 accumulator is truncated to 2.15 format and then saturated to 1.15 format.  
00053  * In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits. 
00054  */ 
00055  
00056  
00057 void arm_fir_sparse_q15( 
00058   arm_fir_sparse_instance_q15 * S, 
00059   q15_t * pSrc, 
00060   q15_t * pDst, 
00061   q15_t * pScratchIn, 
00062   q31_t * pScratchOut, 
00063   uint32_t blockSize) 
00064 { 
00065  
00066   q15_t *pState = S->pState;                     /* State pointer */ 
00067   q15_t *pIn = (q15_t *) pSrc;                   /* Working pointer for input */ 
00068   q15_t *pOut = pDst;                            /* Working pointer for output */ 
00069   q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */ 
00070   q15_t *px;                                     /* Temporary pointers for scratch buffer */ 
00071   q15_t *pb = pScratchIn;                        /* Temporary pointers for scratch buffer */ 
00072   q15_t *py = pState;                            /* Temporary pointers for state buffer */ 
00073   int32_t *pTapDelay = S->pTapDelay;             /* Pointer to the array containing offset of the non-zero tap values. */ 
00074   uint32_t delaySize = S->maxDelay + blockSize;  /* state length */ 
00075   uint16_t numTaps = S->numTaps;                 /* Filter order */ 
00076   int32_t readIndex;                             /* Read index of the state buffer */ 
00077   uint32_t tapCnt, blkCnt;                       /* loop counters */ 
00078   q15_t coeff = *pCoeffs++;                      /* Read the first coefficient value */ 
00079   q31_t *pScr2 = pScratchOut;                    /* Working pointer for pScratchOut */ 
00080   q31_t in1, in2;                                /* Temporary variables */ 
00081  
00082  
00083  
00084   /* BlockSize of Input samples are copied into the state buffer */ 
00085   /* StateIndex points to the starting position to write in the state buffer */ 
00086   arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize); 
00087  
00088   /* Loop over the number of taps. */ 
00089   tapCnt = numTaps; 
00090  
00091   /* Read Index, from where the state buffer should be read, is calculated. */ 
00092   readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 
00093  
00094   /* Wraparound of readIndex */ 
00095   if(readIndex < 0) 
00096   { 
00097     readIndex += (int32_t) delaySize; 
00098   } 
00099  
00100   /* Working pointer for state buffer is updated */ 
00101   py = pState; 
00102  
00103   /* blockSize samples are read from the state buffer */ 
00104   arm_circularRead_q15(py, delaySize, &readIndex, 1, 
00105                        pb, pb, blockSize, 1, blockSize); 
00106  
00107   /* Working pointer for the scratch buffer of state values */ 
00108   px = pb; 
00109  
00110   /* Working pointer for scratch buffer of output values */ 
00111   pScratchOut = pScr2; 
00112  
00113   /* Loop over the blockSize. Unroll by a factor of 4.  
00114    * Compute 4 multiplications at a time. */ 
00115   blkCnt = blockSize >> 2; 
00116  
00117   while(blkCnt > 0u) 
00118   { 
00119     /* Perform multiplication and store in the scratch buffer */ 
00120     *pScratchOut++ = ((q31_t) * px++ * coeff); 
00121     *pScratchOut++ = ((q31_t) * px++ * coeff); 
00122     *pScratchOut++ = ((q31_t) * px++ * coeff); 
00123     *pScratchOut++ = ((q31_t) * px++ * coeff); 
00124  
00125     /* Decrement the loop counter */ 
00126     blkCnt--; 
00127   } 
00128  
00129   /* If the blockSize is not a multiple of 4,  
00130    * compute the remaining samples */ 
00131   blkCnt = blockSize % 0x4u; 
00132  
00133   while(blkCnt > 0u) 
00134   { 
00135     /* Perform multiplication and store in the scratch buffer */ 
00136     *pScratchOut++ = ((q31_t) * px++ * coeff); 
00137  
00138     /* Decrement the loop counter */ 
00139     blkCnt--; 
00140   } 
00141  
00142   /* Load the coefficient value and  
00143    * increment the coefficient buffer for the next set of state values */ 
00144   coeff = *pCoeffs++; 
00145  
00146   /* Read Index, from where the state buffer should be read, is calculated. */ 
00147   readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 
00148  
00149   /* Wraparound of readIndex */ 
00150   if(readIndex < 0) 
00151   { 
00152     readIndex += (int32_t) delaySize; 
00153   } 
00154  
00155   /* Loop over the number of taps. */ 
00156   tapCnt = (uint32_t) numTaps - 1u; 
00157  
00158   while(tapCnt > 0u) 
00159   { 
00160     /* Working pointer for state buffer is updated */ 
00161     py = pState; 
00162  
00163     /* blockSize samples are read from the state buffer */ 
00164     arm_circularRead_q15(py, delaySize, &readIndex, 1, 
00165                          pb, pb, blockSize, 1, blockSize); 
00166  
00167     /* Working pointer for the scratch buffer of state values */ 
00168     px = pb; 
00169  
00170     /* Working pointer for scratch buffer of output values */ 
00171     pScratchOut = pScr2; 
00172  
00173     /* Loop over the blockSize. Unroll by a factor of 4.  
00174      * Compute 4 MACS at a time. */ 
00175     blkCnt = blockSize >> 2; 
00176  
00177     while(blkCnt > 0u) 
00178     { 
00179       /* Perform Multiply-Accumulate */ 
00180       *pScratchOut++ += (q31_t) * px++ * coeff; 
00181       *pScratchOut++ += (q31_t) * px++ * coeff; 
00182       *pScratchOut++ += (q31_t) * px++ * coeff; 
00183       *pScratchOut++ += (q31_t) * px++ * coeff; 
00184  
00185       /* Decrement the loop counter */ 
00186       blkCnt--; 
00187     } 
00188  
00189     /* If the blockSize is not a multiple of 4,  
00190      * compute the remaining samples */ 
00191     blkCnt = blockSize % 0x4u; 
00192  
00193     while(blkCnt > 0u) 
00194     { 
00195       /* Perform Multiply-Accumulate */ 
00196       *pScratchOut++ += (q31_t) * px++ * coeff; 
00197  
00198       /* Decrement the loop counter */ 
00199       blkCnt--; 
00200     } 
00201  
00202     /* Load the coefficient value and  
00203      * increment the coefficient buffer for the next set of state values */ 
00204     coeff = *pCoeffs++; 
00205  
00206     /* Read Index, from where the state buffer should be read, is calculated. */ 
00207     readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 
00208  
00209     /* Wraparound of readIndex */ 
00210     if(readIndex < 0) 
00211     { 
00212       readIndex += (int32_t) delaySize; 
00213     } 
00214  
00215     /* Decrement the tap loop counter */ 
00216     tapCnt--; 
00217   } 
00218  
00219   /* All the output values are in pScratchOut buffer.  
00220      Convert them into 1.15 format, saturate and store in the destination buffer. */ 
00221   /* Loop over the blockSize. */ 
00222   blkCnt = blockSize >> 2; 
00223  
00224   while(blkCnt > 0u) 
00225   { 
00226     in1 = *pScr2++; 
00227     in2 = *pScr2++; 
00228     *__SIMD32(pOut)++ = 
00229       __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), 
00230               16); 
00231  
00232     in1 = *pScr2++; 
00233     in2 = *pScr2++; 
00234     *__SIMD32(pOut)++ = 
00235       __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), 
00236               16); 
00237  
00238     blkCnt--; 
00239  
00240   } 
00241  
00242   /* If the blockSize is not a multiple of 4,  
00243      remaining samples are processed in the below loop */ 
00244   blkCnt = blockSize % 0x4u; 
00245  
00246   while(blkCnt > 0u) 
00247   { 
00248     *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16); 
00249     blkCnt--; 
00250   } 
00251 } 
00252  
00253 /**  
00254  * @} end of FIR_Sparse group  
00255  */