CMSIS DSP library

Dependents:   KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more

Fork of mbed-dsp by mbed official

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_fir_sparse_q7.c Source File

arm_fir_sparse_q7.c

00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        17. January 2013
00005 * $Revision:    V1.4.1
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_fir_sparse_q7.c    
00009 *    
00010 * Description:  Q7 sparse FIR filter processing function.   
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.   
00039 * ------------------------------------------------------------------- */
00040 #include "arm_math.h"
00041 
00042 
00043 /**    
00044  * @ingroup groupFilters    
00045  */
00046 
00047 /**    
00048  * @addtogroup FIR_Sparse    
00049  * @{    
00050  */
00051 
00052 
00053 /**   
00054  * @brief Processing function for the Q7 sparse FIR filter.   
00055  * @param[in]  *S           points to an instance of the Q7 sparse FIR structure.   
00056  * @param[in]  *pSrc        points to the block of input data.   
00057  * @param[out] *pDst        points to the block of output data   
00058  * @param[in]  *pScratchIn  points to a temporary buffer of size blockSize.   
00059  * @param[in]  *pScratchOut points to a temporary buffer of size blockSize.   
00060  * @param[in]  blockSize    number of input samples to process per call.   
00061  * @return none.   
00062  *    
00063  * <b>Scaling and Overflow Behavior:</b>    
00064  * \par    
00065  * The function is implemented using a 32-bit internal accumulator.    
00066  * Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result.    
00067  * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.    
00068  * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.    
00069  * The accumulator is then converted to 18.7 format by discarding the low 7 bits.   
00070  * Finally, the result is truncated to 1.7 format.   
00071  */
00072 
00073 void arm_fir_sparse_q7(
00074   arm_fir_sparse_instance_q7 * S,
00075   q7_t * pSrc,
00076   q7_t * pDst,
00077   q7_t * pScratchIn,
00078   q31_t * pScratchOut,
00079   uint32_t blockSize)
00080 {
00081 
00082   q7_t *pState = S->pState;                      /* State pointer */
00083   q7_t *pCoeffs = S->pCoeffs;                    /* Coefficient pointer */
00084   q7_t *px;                                      /* Scratch buffer pointer */
00085   q7_t *py = pState;                             /* Temporary pointers for state buffer */
00086   q7_t *pb = pScratchIn;                         /* Temporary pointers for scratch buffer */
00087   q7_t *pOut = pDst;                             /* Destination pointer */
00088   int32_t *pTapDelay = S->pTapDelay;             /* Pointer to the array containing offset of the non-zero tap values. */
00089   uint32_t delaySize = S->maxDelay + blockSize;  /* state length */
00090   uint16_t numTaps = S->numTaps;                 /* Filter order */
00091   int32_t readIndex;                             /* Read index of the state buffer */
00092   uint32_t tapCnt, blkCnt;                       /* loop counters */
00093   q7_t coeff = *pCoeffs++;                       /* Read the coefficient value */
00094   q31_t *pScr2 = pScratchOut;                    /* Working pointer for scratch buffer of output values */
00095   q31_t in;
00096 
00097 
00098 #ifndef ARM_MATH_CM0_FAMILY
00099 
00100   /* Run the below code for Cortex-M4 and Cortex-M3 */
00101 
00102   q7_t in1, in2, in3, in4;
00103 
00104   /* BlockSize of Input samples are copied into the state buffer */
00105   /* StateIndex points to the starting position to write in the state buffer */
00106   arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1,
00107                        blockSize);
00108 
00109   /* Loop over the number of taps. */
00110   tapCnt = numTaps;
00111 
00112   /* Read Index, from where the state buffer should be read, is calculated. */
00113   readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
00114 
00115   /* Wraparound of readIndex */
00116   if(readIndex < 0)
00117   {
00118     readIndex += (int32_t) delaySize;
00119   }
00120 
00121   /* Working pointer for state buffer is updated */
00122   py = pState;
00123 
00124   /* blockSize samples are read from the state buffer */
00125   arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
00126                       (int32_t) blockSize, 1, blockSize);
00127 
00128   /* Working pointer for the scratch buffer of state values */
00129   px = pb;
00130 
00131   /* Working pointer for scratch buffer of output values */
00132   pScratchOut = pScr2;
00133 
00134   /* Loop over the blockSize. Unroll by a factor of 4.    
00135    * Compute 4 multiplications at a time. */
00136   blkCnt = blockSize >> 2;
00137 
00138   while(blkCnt > 0u)
00139   {
00140     /* Perform multiplication and store in the scratch buffer */
00141     *pScratchOut++ = ((q31_t) * px++ * coeff);
00142     *pScratchOut++ = ((q31_t) * px++ * coeff);
00143     *pScratchOut++ = ((q31_t) * px++ * coeff);
00144     *pScratchOut++ = ((q31_t) * px++ * coeff);
00145 
00146     /* Decrement the loop counter */
00147     blkCnt--;
00148   }
00149 
00150   /* If the blockSize is not a multiple of 4,    
00151    * compute the remaining samples */
00152   blkCnt = blockSize % 0x4u;
00153 
00154   while(blkCnt > 0u)
00155   {
00156     /* Perform multiplication and store in the scratch buffer */
00157     *pScratchOut++ = ((q31_t) * px++ * coeff);
00158 
00159     /* Decrement the loop counter */
00160     blkCnt--;
00161   }
00162 
00163   /* Load the coefficient value and    
00164    * increment the coefficient buffer for the next set of state values */
00165   coeff = *pCoeffs++;
00166 
00167   /* Read Index, from where the state buffer should be read, is calculated. */
00168   readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
00169 
00170   /* Wraparound of readIndex */
00171   if(readIndex < 0)
00172   {
00173     readIndex += (int32_t) delaySize;
00174   }
00175 
00176   /* Loop over the number of taps. */
00177   tapCnt = (uint32_t) numTaps - 1u;
00178 
00179   while(tapCnt > 0u)
00180   {
00181     /* Working pointer for state buffer is updated */
00182     py = pState;
00183 
00184     /* blockSize samples are read from the state buffer */
00185     arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
00186                         (int32_t) blockSize, 1, blockSize);
00187 
00188     /* Working pointer for the scratch buffer of state values */
00189     px = pb;
00190 
00191     /* Working pointer for scratch buffer of output values */
00192     pScratchOut = pScr2;
00193 
00194     /* Loop over the blockSize. Unroll by a factor of 4.    
00195      * Compute 4 MACS at a time. */
00196     blkCnt = blockSize >> 2;
00197 
00198     while(blkCnt > 0u)
00199     {
00200       /* Perform Multiply-Accumulate */
00201       in = *pScratchOut + ((q31_t) * px++ * coeff);
00202       *pScratchOut++ = in;
00203       in = *pScratchOut + ((q31_t) * px++ * coeff);
00204       *pScratchOut++ = in;
00205       in = *pScratchOut + ((q31_t) * px++ * coeff);
00206       *pScratchOut++ = in;
00207       in = *pScratchOut + ((q31_t) * px++ * coeff);
00208       *pScratchOut++ = in;
00209 
00210       /* Decrement the loop counter */
00211       blkCnt--;
00212     }
00213 
00214     /* If the blockSize is not a multiple of 4,    
00215      * compute the remaining samples */
00216     blkCnt = blockSize % 0x4u;
00217 
00218     while(blkCnt > 0u)
00219     {
00220       /* Perform Multiply-Accumulate */
00221       in = *pScratchOut + ((q31_t) * px++ * coeff);
00222       *pScratchOut++ = in;
00223 
00224       /* Decrement the loop counter */
00225       blkCnt--;
00226     }
00227 
00228     /* Load the coefficient value and    
00229      * increment the coefficient buffer for the next set of state values */
00230     coeff = *pCoeffs++;
00231 
00232     /* Read Index, from where the state buffer should be read, is calculated. */
00233     readIndex = ((int32_t) S->stateIndex -
00234                  (int32_t) blockSize) - *pTapDelay++;
00235 
00236     /* Wraparound of readIndex */
00237     if(readIndex < 0)
00238     {
00239       readIndex += (int32_t) delaySize;
00240     }
00241 
00242     /* Decrement the tap loop counter */
00243     tapCnt--;
00244   }
00245 
00246   /* All the output values are in pScratchOut buffer.    
00247      Convert them into 1.15 format, saturate and store in the destination buffer. */
00248   /* Loop over the blockSize. */
00249   blkCnt = blockSize >> 2;
00250 
00251   while(blkCnt > 0u)
00252   {
00253     in1 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
00254     in2 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
00255     in3 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
00256     in4 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
00257 
00258     *__SIMD32(pOut)++ = __PACKq7(in1, in2, in3, in4);
00259 
00260     /* Decrement the blockSize loop counter */
00261     blkCnt--;
00262   }
00263 
00264   /* If the blockSize is not a multiple of 4,    
00265      remaining samples are processed in the below loop */
00266   blkCnt = blockSize % 0x4u;
00267 
00268   while(blkCnt > 0u)
00269   {
00270     *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8);
00271 
00272     /* Decrement the blockSize loop counter */
00273     blkCnt--;
00274   }
00275 
00276 #else
00277 
00278   /* Run the below code for Cortex-M0 */
00279 
00280   /* BlockSize of Input samples are copied into the state buffer */
00281   /* StateIndex points to the starting position to write in the state buffer */
00282   arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1,
00283                        blockSize);
00284 
00285   /* Loop over the number of taps. */
00286   tapCnt = numTaps;
00287 
00288   /* Read Index, from where the state buffer should be read, is calculated. */
00289   readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
00290 
00291   /* Wraparound of readIndex */
00292   if(readIndex < 0)
00293   {
00294     readIndex += (int32_t) delaySize;
00295   }
00296 
00297   /* Working pointer for state buffer is updated */
00298   py = pState;
00299 
00300   /* blockSize samples are read from the state buffer */
00301   arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
00302                       (int32_t) blockSize, 1, blockSize);
00303 
00304   /* Working pointer for the scratch buffer of state values */
00305   px = pb;
00306 
00307   /* Working pointer for scratch buffer of output values */
00308   pScratchOut = pScr2;
00309 
00310   /* Loop over the blockSize */
00311   blkCnt = blockSize;
00312 
00313   while(blkCnt > 0u)
00314   {
00315     /* Perform multiplication and store in the scratch buffer */
00316     *pScratchOut++ = ((q31_t) * px++ * coeff);
00317 
00318     /* Decrement the loop counter */
00319     blkCnt--;
00320   }
00321 
00322   /* Load the coefficient value and           
00323    * increment the coefficient buffer for the next set of state values */
00324   coeff = *pCoeffs++;
00325 
00326   /* Read Index, from where the state buffer should be read, is calculated. */
00327   readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
00328 
00329   /* Wraparound of readIndex */
00330   if(readIndex < 0)
00331   {
00332     readIndex += (int32_t) delaySize;
00333   }
00334 
00335   /* Loop over the number of taps. */
00336   tapCnt = (uint32_t) numTaps - 1u;
00337 
00338   while(tapCnt > 0u)
00339   {
00340     /* Working pointer for state buffer is updated */
00341     py = pState;
00342 
00343     /* blockSize samples are read from the state buffer */
00344     arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
00345                         (int32_t) blockSize, 1, blockSize);
00346 
00347     /* Working pointer for the scratch buffer of state values */
00348     px = pb;
00349 
00350     /* Working pointer for scratch buffer of output values */
00351     pScratchOut = pScr2;
00352 
00353     /* Loop over the blockSize */
00354     blkCnt = blockSize;
00355 
00356     while(blkCnt > 0u)
00357     {
00358       /* Perform Multiply-Accumulate */
00359       in = *pScratchOut + ((q31_t) * px++ * coeff);
00360       *pScratchOut++ = in;
00361 
00362       /* Decrement the loop counter */
00363       blkCnt--;
00364     }
00365 
00366     /* Load the coefficient value and           
00367      * increment the coefficient buffer for the next set of state values */
00368     coeff = *pCoeffs++;
00369 
00370     /* Read Index, from where the state buffer should be read, is calculated. */
00371     readIndex =
00372       ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
00373 
00374     /* Wraparound of readIndex */
00375     if(readIndex < 0)
00376     {
00377       readIndex += (int32_t) delaySize;
00378     }
00379 
00380     /* Decrement the tap loop counter */
00381     tapCnt--;
00382   }
00383 
00384   /* All the output values are in pScratchOut buffer.       
00385      Convert them into 1.15 format, saturate and store in the destination buffer. */
00386   /* Loop over the blockSize. */
00387   blkCnt = blockSize;
00388 
00389   while(blkCnt > 0u)
00390   {
00391     *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8);
00392 
00393     /* Decrement the blockSize loop counter */
00394     blkCnt--;
00395   }
00396 
00397 #endif /*   #ifndef ARM_MATH_CM0_FAMILY */
00398 
00399 }
00400 
00401 /**    
00402  * @} end of FIR_Sparse group    
00403  */