CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_fir_sparse_q31.c Source File

arm_fir_sparse_q31.c

00001 /* ----------------------------------------------------------------------  
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        29. November 2010  
00005 * $Revision:    V1.0.3  
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_fir_sparse_q31.c  
00009 *  
00010 * Description:  Q31 sparse FIR filter processing function. 
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.3 2010/11/29 
00015 *    Re-organized the CMSIS folders and updated documentation.  
00016 *   
00017 * Version 1.0.2 2010/11/11  
00018 *    Documentation updated.   
00019 *  
00020 * Version 1.0.1 2010/10/05   
00021 *    Production release and review comments incorporated.  
00022 *  
00023 * Version 1.0.0 2010/09/20   
00024 *    Production release and review comments incorporated  
00025 *  
00026 * Version 0.0.7  2010/06/10   
00027 *    Misra-C changes done  
00028 * ------------------------------------------------------------------- */ 
00029 #include "arm_math.h" 
00030  
00031  
00032 /**  
00033  * @addtogroup FIR_Sparse  
00034  * @{  
00035  */ 
00036  
00037 /** 
00038  * @brief Processing function for the Q31 sparse FIR filter. 
00039  * @param[in]  *S          points to an instance of the Q31 sparse FIR structure. 
00040  * @param[in]  *pSrc       points to the block of input data. 
00041  * @param[out] *pDst       points to the block of output data 
00042  * @param[in]  *pScratchIn points to a temporary buffer of size blockSize. 
00043  * @param[in]  blockSize   number of input samples to process per call. 
00044  * @return none. 
00045  *  
00046  * <b>Scaling and Overflow Behavior:</b>  
00047  * \par  
00048  * The function is implemented using an internal 32-bit accumulator. 
00049  * The 1.31 x 1.31 multiplications are truncated to 2.30 format. 
00050  * This leads to loss of precision on the intermediate multiplications and provides only a single guard bit.  
00051  * If the accumulator result overflows, it wraps around rather than saturate. 
00052  * In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits. 
00053  */ 
00054  
00055 void arm_fir_sparse_q31( 
00056   arm_fir_sparse_instance_q31 * S, 
00057   q31_t * pSrc, 
00058   q31_t * pDst, 
00059   q31_t * pScratchIn, 
00060   uint32_t blockSize) 
00061 { 
00062  
00063   q31_t *pState = S->pState;                     /* State pointer */ 
00064   q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */ 
00065   q31_t *px;                                     /* Scratch buffer pointer */ 
00066   q31_t *py = pState;                            /* Temporary pointers for state buffer */ 
00067   q31_t *pb = pScratchIn;                        /* Temporary pointers for scratch buffer */ 
00068   q31_t *pOut;                                   /* Destination pointer */ 
00069   q63_t out;                                     /* Temporary output variable */ 
00070   int32_t *pTapDelay = S->pTapDelay;             /* Pointer to the array containing offset of the non-zero tap values. */ 
00071   uint32_t delaySize = S->maxDelay + blockSize;  /* state length */ 
00072   uint16_t numTaps = S->numTaps;                 /* Filter order */ 
00073   int32_t readIndex;                             /* Read index of the state buffer */ 
00074   uint32_t tapCnt, blkCnt;                       /* loop counters */ 
00075   q31_t coeff = *pCoeffs++;                      /* Read the first coefficient value */ 
00076   q31_t in; 
00077  
00078  
00079   /* BlockSize of Input samples are copied into the state buffer */ 
00080   /* StateIndex points to the starting position to write in the state buffer */ 
00081   arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1, 
00082                         (int32_t *) pSrc, 1, blockSize); 
00083  
00084   /* Read Index, from where the state buffer should be read, is calculated. */ 
00085   readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; 
00086  
00087   /* Wraparound of readIndex */ 
00088   if(readIndex < 0) 
00089   { 
00090     readIndex += (int32_t) delaySize; 
00091   } 
00092  
00093   /* Working pointer for state buffer is updated */ 
00094   py = pState; 
00095  
00096   /* blockSize samples are read from the state buffer */ 
00097   arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, 
00098                        (int32_t *) pb, (int32_t *) pb, blockSize, 1, 
00099                        blockSize); 
00100  
00101   /* Working pointer for the scratch buffer of state values */ 
00102   px = pb; 
00103  
00104   /* Working pointer for scratch buffer of output values */ 
00105   pOut = pDst; 
00106  
00107   /* Loop over the blockSize. Unroll by a factor of 4.  
00108    * Compute 4 Multiplications at a time. */ 
00109   blkCnt = blockSize >> 2; 
00110  
00111   while(blkCnt > 0u) 
00112   { 
00113     /* Perform Multiplications and store in the destination buffer */ 
00114     *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); 
00115     *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); 
00116     *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); 
00117     *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); 
00118  
00119     /* Decrement the loop counter */ 
00120     blkCnt--; 
00121   } 
00122  
00123   /* If the blockSize is not a multiple of 4,  
00124    * compute the remaining samples */ 
00125   blkCnt = blockSize % 0x4u; 
00126  
00127   while(blkCnt > 0u) 
00128   { 
00129     /* Perform Multiplications and store in the destination buffer */ 
00130     *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); 
00131  
00132     /* Decrement the loop counter */ 
00133     blkCnt--; 
00134   } 
00135  
00136   /* Load the coefficient value and  
00137    * increment the coefficient buffer for the next set of state values */ 
00138   coeff = *pCoeffs++; 
00139  
00140   /* Read Index, from where the state buffer should be read, is calculated. */ 
00141   readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; 
00142  
00143   /* Wraparound of readIndex */ 
00144   if(readIndex < 0) 
00145   { 
00146     readIndex += (int32_t) delaySize; 
00147   } 
00148  
00149   /* Loop over the number of taps. */ 
00150   tapCnt = (uint32_t) numTaps - 1u; 
00151  
00152   while(tapCnt > 0u) 
00153   { 
00154     /* Working pointer for state buffer is updated */ 
00155     py = pState; 
00156  
00157     /* blockSize samples are read from the state buffer */ 
00158     arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, 
00159                          (int32_t *) pb, (int32_t *) pb, blockSize, 1, 
00160                          blockSize); 
00161  
00162     /* Working pointer for the scratch buffer of state values */ 
00163     px = pb; 
00164  
00165     /* Working pointer for scratch buffer of output values */ 
00166     pOut = pDst; 
00167  
00168     /* Loop over the blockSize. Unroll by a factor of 4.  
00169      * Compute 4 MACS at a time. */ 
00170     blkCnt = blockSize >> 2; 
00171  
00172     while(blkCnt > 0u) 
00173     { 
00174       out = *pOut; 
00175       out += ((q63_t) * px++ * coeff) >> 32; 
00176       *pOut++ = (q31_t) (out); 
00177  
00178       out = *pOut; 
00179       out += ((q63_t) * px++ * coeff) >> 32; 
00180       *pOut++ = (q31_t) (out); 
00181  
00182       out = *pOut; 
00183       out += ((q63_t) * px++ * coeff) >> 32; 
00184       *pOut++ = (q31_t) (out); 
00185  
00186       out = *pOut; 
00187       out += ((q63_t) * px++ * coeff) >> 32; 
00188       *pOut++ = (q31_t) (out); 
00189  
00190       /* Decrement the loop counter */ 
00191       blkCnt--; 
00192     } 
00193  
00194     /* If the blockSize is not a multiple of 4,  
00195      * compute the remaining samples */ 
00196     blkCnt = blockSize % 0x4u; 
00197  
00198     while(blkCnt > 0u) 
00199     { 
00200       /* Perform Multiply-Accumulate */ 
00201       out = *pOut; 
00202       out += ((q63_t) * px++ * coeff) >> 32; 
00203       *pOut++ = (q31_t) (out); 
00204  
00205       /* Decrement the loop counter */ 
00206       blkCnt--; 
00207     } 
00208  
00209     /* Load the coefficient value and  
00210      * increment the coefficient buffer for the next set of state values */ 
00211     coeff = *pCoeffs++; 
00212  
00213     /* Read Index, from where the state buffer should be read, is calculated. */ 
00214     readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; 
00215  
00216     /* Wraparound of readIndex */ 
00217     if(readIndex < 0) 
00218     { 
00219       readIndex += (int32_t) delaySize; 
00220     } 
00221  
00222     /* Decrement the tap loop counter */ 
00223     tapCnt--; 
00224   } 
00225  
00226   /* Working output pointer is updated */ 
00227   pOut = pDst; 
00228  
00229   /* Output is converted into 1.15 format. */ 
00230   /* Loop over the blockSize. Unroll by a factor of 4.  
00231    * process 4 output samples at a time. */ 
00232   blkCnt = blockSize >> 2; 
00233  
00234   while(blkCnt > 0u) 
00235   { 
00236     in = *pOut << 1; 
00237     *pOut++ = in; 
00238     in = *pOut << 1; 
00239     *pOut++ = in; 
00240     in = *pOut << 1; 
00241     *pOut++ = in; 
00242     in = *pOut << 1; 
00243     *pOut++ = in; 
00244  
00245     /* Decrement the loop counter */ 
00246     blkCnt--; 
00247   } 
00248  
00249   /* If the blockSize is not a multiple of 4,  
00250    * process the remaining output samples */ 
00251   blkCnt = blockSize % 0x4u; 
00252  
00253   while(blkCnt > 0u) 
00254   { 
00255     in = *pOut << 1; 
00256     *pOut++ = in; 
00257  
00258     /* Decrement the loop counter */ 
00259     blkCnt--; 
00260   } 
00261 } 
00262  
00263 /**  
00264  * @} end of FIR_Sparse group  
00265  */