CMSIS DSP library

Dependents:   performance_timer Surfboard_ gps2rtty Capstone ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_fir_sparse_q7.c Source File

arm_fir_sparse_q7.c

00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        19. March 2015
00005 * $Revision:    V.1.4.5
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_fir_sparse_q7.c    
00009 *    
00010 * Description:  Q7 sparse FIR filter processing function.   
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.   
00039 * ------------------------------------------------------------------- */
00040 #include "arm_math.h"
00041 
00042 
00043 /**    
00044  * @ingroup groupFilters    
00045  */
00046 
00047 /**    
00048  * @addtogroup FIR_Sparse    
00049  * @{    
00050  */
00051 
00052 
00053 /**   
00054  * @brief Processing function for the Q7 sparse FIR filter.   
00055  * @param[in]  *S           points to an instance of the Q7 sparse FIR structure.   
00056  * @param[in]  *pSrc        points to the block of input data.   
00057  * @param[out] *pDst        points to the block of output data   
00058  * @param[in]  *pScratchIn  points to a temporary buffer of size blockSize.   
00059  * @param[in]  *pScratchOut points to a temporary buffer of size blockSize.   
00060  * @param[in]  blockSize    number of input samples to process per call.   
00061  * @return none.   
00062  *    
00063  * <b>Scaling and Overflow Behavior:</b>    
00064  * \par    
00065  * The function is implemented using a 32-bit internal accumulator.    
00066  * Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result.    
00067  * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.    
00068  * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.    
00069  * The accumulator is then converted to 18.7 format by discarding the low 7 bits.   
00070  * Finally, the result is truncated to 1.7 format.   
00071  */
00072 
00073 void arm_fir_sparse_q7(
00074   arm_fir_sparse_instance_q7 * S,
00075   q7_t * pSrc,
00076   q7_t * pDst,
00077   q7_t * pScratchIn,
00078   q31_t * pScratchOut,
00079   uint32_t blockSize)
00080 {
00081 
00082   q7_t *pState = S->pState;                      /* State pointer */
00083   q7_t *pCoeffs = S->pCoeffs;                    /* Coefficient pointer */
00084   q7_t *px;                                      /* Scratch buffer pointer */
00085   q7_t *py = pState;                             /* Temporary pointers for state buffer */
00086   q7_t *pb = pScratchIn;                         /* Temporary pointers for scratch buffer */
00087   q7_t *pOut = pDst;                             /* Destination pointer */
00088   int32_t *pTapDelay = S->pTapDelay;             /* Pointer to the array containing offset of the non-zero tap values. */
00089   uint32_t delaySize = S->maxDelay + blockSize;  /* state length */
00090   uint16_t numTaps = S->numTaps;                 /* Filter order */
00091   int32_t readIndex;                             /* Read index of the state buffer */
00092   uint32_t tapCnt, blkCnt;                       /* loop counters */
00093   q7_t coeff = *pCoeffs++;                       /* Read the coefficient value */
00094   q31_t *pScr2 = pScratchOut;                    /* Working pointer for scratch buffer of output values */
00095   q31_t in;
00096 
00097 
00098 #ifndef ARM_MATH_CM0_FAMILY
00099 
00100   /* Run the below code for Cortex-M4 and Cortex-M3 */
00101 
00102   q7_t in1, in2, in3, in4;
00103 
00104   /* BlockSize of Input samples are copied into the state buffer */
00105   /* StateIndex points to the starting position to write in the state buffer */
00106   arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1,
00107                        blockSize);
00108 
00109   /* Loop over the number of taps. */
00110   tapCnt = numTaps;
00111 
00112   /* Read Index, from where the state buffer should be read, is calculated. */
00113   readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
00114 
00115   /* Wraparound of readIndex */
00116   if(readIndex < 0)
00117   {
00118     readIndex += (int32_t) delaySize;
00119   }
00120 
00121   /* Working pointer for state buffer is updated */
00122   py = pState;
00123 
00124   /* blockSize samples are read from the state buffer */
00125   arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
00126                       (int32_t) blockSize, 1, blockSize);
00127 
00128   /* Working pointer for the scratch buffer of state values */
00129   px = pb;
00130 
00131   /* Working pointer for scratch buffer of output values */
00132   pScratchOut = pScr2;
00133 
00134   /* Loop over the blockSize. Unroll by a factor of 4.    
00135    * Compute 4 multiplications at a time. */
00136   blkCnt = blockSize >> 2;
00137 
00138   while(blkCnt > 0u)
00139   {
00140     /* Perform multiplication and store in the scratch buffer */
00141     *pScratchOut++ = ((q31_t) * px++ * coeff);
00142     *pScratchOut++ = ((q31_t) * px++ * coeff);
00143     *pScratchOut++ = ((q31_t) * px++ * coeff);
00144     *pScratchOut++ = ((q31_t) * px++ * coeff);
00145 
00146     /* Decrement the loop counter */
00147     blkCnt--;
00148   }
00149 
00150   /* If the blockSize is not a multiple of 4,    
00151    * compute the remaining samples */
00152   blkCnt = blockSize % 0x4u;
00153 
00154   while(blkCnt > 0u)
00155   {
00156     /* Perform multiplication and store in the scratch buffer */
00157     *pScratchOut++ = ((q31_t) * px++ * coeff);
00158 
00159     /* Decrement the loop counter */
00160     blkCnt--;
00161   }
00162 
00163   /* Load the coefficient value and    
00164    * increment the coefficient buffer for the next set of state values */
00165   coeff = *pCoeffs++;
00166 
00167   /* Read Index, from where the state buffer should be read, is calculated. */
00168   readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
00169 
00170   /* Wraparound of readIndex */
00171   if(readIndex < 0)
00172   {
00173     readIndex += (int32_t) delaySize;
00174   }
00175 
00176   /* Loop over the number of taps. */
00177   tapCnt = (uint32_t) numTaps - 2u;
00178 
00179   while(tapCnt > 0u)
00180   {
00181     /* Working pointer for state buffer is updated */
00182     py = pState;
00183 
00184     /* blockSize samples are read from the state buffer */
00185     arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
00186                         (int32_t) blockSize, 1, blockSize);
00187 
00188     /* Working pointer for the scratch buffer of state values */
00189     px = pb;
00190 
00191     /* Working pointer for scratch buffer of output values */
00192     pScratchOut = pScr2;
00193 
00194     /* Loop over the blockSize. Unroll by a factor of 4.    
00195      * Compute 4 MACS at a time. */
00196     blkCnt = blockSize >> 2;
00197 
00198     while(blkCnt > 0u)
00199     {
00200       /* Perform Multiply-Accumulate */
00201       in = *pScratchOut + ((q31_t) * px++ * coeff);
00202       *pScratchOut++ = in;
00203       in = *pScratchOut + ((q31_t) * px++ * coeff);
00204       *pScratchOut++ = in;
00205       in = *pScratchOut + ((q31_t) * px++ * coeff);
00206       *pScratchOut++ = in;
00207       in = *pScratchOut + ((q31_t) * px++ * coeff);
00208       *pScratchOut++ = in;
00209 
00210       /* Decrement the loop counter */
00211       blkCnt--;
00212     }
00213 
00214     /* If the blockSize is not a multiple of 4,    
00215      * compute the remaining samples */
00216     blkCnt = blockSize % 0x4u;
00217 
00218     while(blkCnt > 0u)
00219     {
00220       /* Perform Multiply-Accumulate */
00221       in = *pScratchOut + ((q31_t) * px++ * coeff);
00222       *pScratchOut++ = in;
00223 
00224       /* Decrement the loop counter */
00225       blkCnt--;
00226     }
00227 
00228     /* Load the coefficient value and    
00229      * increment the coefficient buffer for the next set of state values */
00230     coeff = *pCoeffs++;
00231 
00232     /* Read Index, from where the state buffer should be read, is calculated. */
00233     readIndex = ((int32_t) S->stateIndex -
00234                  (int32_t) blockSize) - *pTapDelay++;
00235 
00236     /* Wraparound of readIndex */
00237     if(readIndex < 0)
00238     {
00239       readIndex += (int32_t) delaySize;
00240     }
00241 
00242     /* Decrement the tap loop counter */
00243     tapCnt--;
00244   }
00245     
00246     /* Compute last tap without the final read of pTapDelay */  
00247     
00248     /* Working pointer for state buffer is updated */
00249     py = pState;
00250 
00251     /* blockSize samples are read from the state buffer */
00252     arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
00253                                             (int32_t) blockSize, 1, blockSize);
00254 
00255     /* Working pointer for the scratch buffer of state values */
00256     px = pb;
00257 
00258     /* Working pointer for scratch buffer of output values */
00259     pScratchOut = pScr2;
00260 
00261     /* Loop over the blockSize. Unroll by a factor of 4.    
00262      * Compute 4 MACS at a time. */
00263     blkCnt = blockSize >> 2;
00264 
00265     while(blkCnt > 0u)
00266     {
00267         /* Perform Multiply-Accumulate */
00268         in = *pScratchOut + ((q31_t) * px++ * coeff);
00269         *pScratchOut++ = in;
00270         in = *pScratchOut + ((q31_t) * px++ * coeff);
00271         *pScratchOut++ = in;
00272         in = *pScratchOut + ((q31_t) * px++ * coeff);
00273         *pScratchOut++ = in;
00274         in = *pScratchOut + ((q31_t) * px++ * coeff);
00275         *pScratchOut++ = in;
00276 
00277         /* Decrement the loop counter */
00278         blkCnt--;
00279     }
00280 
00281     /* If the blockSize is not a multiple of 4,    
00282      * compute the remaining samples */
00283     blkCnt = blockSize % 0x4u;
00284 
00285     while(blkCnt > 0u)
00286     {
00287         /* Perform Multiply-Accumulate */
00288         in = *pScratchOut + ((q31_t) * px++ * coeff);
00289         *pScratchOut++ = in;
00290 
00291         /* Decrement the loop counter */
00292         blkCnt--;
00293     }
00294 
00295   /* All the output values are in pScratchOut buffer.    
00296      Convert them into 1.15 format, saturate and store in the destination buffer. */
00297   /* Loop over the blockSize. */
00298   blkCnt = blockSize >> 2;
00299 
00300   while(blkCnt > 0u)
00301   {
00302     in1 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
00303     in2 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
00304     in3 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
00305     in4 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
00306 
00307     *__SIMD32(pOut)++ = __PACKq7(in1, in2, in3, in4);
00308 
00309     /* Decrement the blockSize loop counter */
00310     blkCnt--;
00311   }
00312 
00313   /* If the blockSize is not a multiple of 4,    
00314      remaining samples are processed in the below loop */
00315   blkCnt = blockSize % 0x4u;
00316 
00317   while(blkCnt > 0u)
00318   {
00319     *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8);
00320 
00321     /* Decrement the blockSize loop counter */
00322     blkCnt--;
00323   }
00324 
00325 #else
00326 
00327   /* Run the below code for Cortex-M0 */
00328 
00329   /* BlockSize of Input samples are copied into the state buffer */
00330   /* StateIndex points to the starting position to write in the state buffer */
00331   arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1,
00332                        blockSize);
00333 
00334   /* Loop over the number of taps. */
00335   tapCnt = numTaps;
00336 
00337   /* Read Index, from where the state buffer should be read, is calculated. */
00338   readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
00339 
00340   /* Wraparound of readIndex */
00341   if(readIndex < 0)
00342   {
00343     readIndex += (int32_t) delaySize;
00344   }
00345 
00346   /* Working pointer for state buffer is updated */
00347   py = pState;
00348 
00349   /* blockSize samples are read from the state buffer */
00350   arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
00351                       (int32_t) blockSize, 1, blockSize);
00352 
00353   /* Working pointer for the scratch buffer of state values */
00354   px = pb;
00355 
00356   /* Working pointer for scratch buffer of output values */
00357   pScratchOut = pScr2;
00358 
00359   /* Loop over the blockSize */
00360   blkCnt = blockSize;
00361 
00362   while(blkCnt > 0u)
00363   {
00364     /* Perform multiplication and store in the scratch buffer */
00365     *pScratchOut++ = ((q31_t) * px++ * coeff);
00366 
00367     /* Decrement the loop counter */
00368     blkCnt--;
00369   }
00370 
00371   /* Load the coefficient value and           
00372    * increment the coefficient buffer for the next set of state values */
00373   coeff = *pCoeffs++;
00374 
00375   /* Read Index, from where the state buffer should be read, is calculated. */
00376   readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
00377 
00378   /* Wraparound of readIndex */
00379   if(readIndex < 0)
00380   {
00381     readIndex += (int32_t) delaySize;
00382   }
00383 
00384   /* Loop over the number of taps. */
00385   tapCnt = (uint32_t) numTaps - 2u;
00386 
00387   while(tapCnt > 0u)
00388   {
00389     /* Working pointer for state buffer is updated */
00390     py = pState;
00391 
00392     /* blockSize samples are read from the state buffer */
00393     arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
00394                         (int32_t) blockSize, 1, blockSize);
00395 
00396     /* Working pointer for the scratch buffer of state values */
00397     px = pb;
00398 
00399     /* Working pointer for scratch buffer of output values */
00400     pScratchOut = pScr2;
00401 
00402     /* Loop over the blockSize */
00403     blkCnt = blockSize;
00404 
00405     while(blkCnt > 0u)
00406     {
00407       /* Perform Multiply-Accumulate */
00408       in = *pScratchOut + ((q31_t) * px++ * coeff);
00409       *pScratchOut++ = in;
00410 
00411       /* Decrement the loop counter */
00412       blkCnt--;
00413     }
00414 
00415     /* Load the coefficient value and           
00416      * increment the coefficient buffer for the next set of state values */
00417     coeff = *pCoeffs++;
00418 
00419     /* Read Index, from where the state buffer should be read, is calculated. */
00420     readIndex =
00421       ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
00422 
00423     /* Wraparound of readIndex */
00424     if(readIndex < 0)
00425     {
00426       readIndex += (int32_t) delaySize;
00427     }
00428 
00429     /* Decrement the tap loop counter */
00430     tapCnt--;
00431   }
00432     
00433     /* Compute last tap without the final read of pTapDelay */  
00434     
00435     /* Working pointer for state buffer is updated */
00436     py = pState;
00437 
00438     /* blockSize samples are read from the state buffer */
00439     arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
00440                                             (int32_t) blockSize, 1, blockSize);
00441 
00442     /* Working pointer for the scratch buffer of state values */
00443     px = pb;
00444 
00445     /* Working pointer for scratch buffer of output values */
00446     pScratchOut = pScr2;
00447 
00448     /* Loop over the blockSize */
00449     blkCnt = blockSize;
00450 
00451     while(blkCnt > 0u)
00452     {
00453         /* Perform Multiply-Accumulate */
00454         in = *pScratchOut + ((q31_t) * px++ * coeff);
00455         *pScratchOut++ = in;
00456 
00457         /* Decrement the loop counter */
00458         blkCnt--;
00459     }
00460 
00461   /* All the output values are in pScratchOut buffer.       
00462      Convert them into 1.15 format, saturate and store in the destination buffer. */
00463   /* Loop over the blockSize. */
00464   blkCnt = blockSize;
00465 
00466   while(blkCnt > 0u)
00467   {
00468     *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8);
00469 
00470     /* Decrement the blockSize loop counter */
00471     blkCnt--;
00472   }
00473 
00474 #endif /*   #ifndef ARM_MATH_CM0_FAMILY */
00475 
00476 }
00477 
00478 /**    
00479  * @} end of FIR_Sparse group    
00480  */