CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details
Dependents: K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more
arm_fir_sparse_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 29. November 2010 00005 * $Revision: V1.0.3 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_fir_sparse_q15.c 00009 * 00010 * Description: Q15 sparse FIR filter processing function. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Version 1.0.3 2010/11/29 00015 * Re-organized the CMSIS folders and updated documentation. 00016 * 00017 * Version 1.0.2 2010/11/11 00018 * Documentation updated. 00019 * 00020 * Version 1.0.1 2010/10/05 00021 * Production release and review comments incorporated. 00022 * 00023 * Version 1.0.0 2010/09/20 00024 * Production release and review comments incorporated 00025 * 00026 * Version 0.0.7 2010/06/10 00027 * Misra-C changes done 00028 * ------------------------------------------------------------------- */ 00029 #include "arm_math.h" 00030 00031 /** 00032 * @addtogroup FIR_Sparse 00033 * @{ 00034 */ 00035 00036 /** 00037 * @brief Processing function for the Q15 sparse FIR filter. 00038 * @param[in] *S points to an instance of the Q15 sparse FIR structure. 00039 * @param[in] *pSrc points to the block of input data. 00040 * @param[out] *pDst points to the block of output data 00041 * @param[in] *pScratchIn points to a temporary buffer of size blockSize. 00042 * @param[in] *pScratchOut points to a temporary buffer of size blockSize. 00043 * @param[in] blockSize number of input samples to process per call. 00044 * @return none. 00045 * 00046 * <b>Scaling and Overflow Behavior:</b> 00047 * \par 00048 * The function is implemented using an internal 32-bit accumulator. 00049 * The 1.15 x 1.15 multiplications yield a 2.30 result and these are added to a 2.30 accumulator. 00050 * Thus the full precision of the multiplications is maintained but there is only a single guard bit in the accumulator. 00051 * If the accumulator result overflows it will wrap around rather than saturate. 00052 * After all multiply-accumulates are performed, the 2.30 accumulator is truncated to 2.15 format and then saturated to 1.15 format. 00053 * In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits. 00054 */ 00055 00056 00057 void arm_fir_sparse_q15( 00058 arm_fir_sparse_instance_q15 * S, 00059 q15_t * pSrc, 00060 q15_t * pDst, 00061 q15_t * pScratchIn, 00062 q31_t * pScratchOut, 00063 uint32_t blockSize) 00064 { 00065 00066 q15_t *pState = S->pState; /* State pointer */ 00067 q15_t *pIn = (q15_t *) pSrc; /* Working pointer for input */ 00068 q15_t *pOut = pDst; /* Working pointer for output */ 00069 q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 00070 q15_t *px; /* Temporary pointers for scratch buffer */ 00071 q15_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */ 00072 q15_t *py = pState; /* Temporary pointers for state buffer */ 00073 int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */ 00074 uint32_t delaySize = S->maxDelay + blockSize; /* state length */ 00075 uint16_t numTaps = S->numTaps; /* Filter order */ 00076 int32_t readIndex; /* Read index of the state buffer */ 00077 uint32_t tapCnt, blkCnt; /* loop counters */ 00078 q15_t coeff = *pCoeffs++; /* Read the first coefficient value */ 00079 q31_t *pScr2 = pScratchOut; /* Working pointer for pScratchOut */ 00080 q31_t in1, in2; /* Temporary variables */ 00081 00082 00083 00084 /* BlockSize of Input samples are copied into the state buffer */ 00085 /* StateIndex points to the starting position to write in the state buffer */ 00086 arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize); 00087 00088 /* Loop over the number of taps. */ 00089 tapCnt = numTaps; 00090 00091 /* Read Index, from where the state buffer should be read, is calculated. */ 00092 readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 00093 00094 /* Wraparound of readIndex */ 00095 if(readIndex < 0) 00096 { 00097 readIndex += (int32_t) delaySize; 00098 } 00099 00100 /* Working pointer for state buffer is updated */ 00101 py = pState; 00102 00103 /* blockSize samples are read from the state buffer */ 00104 arm_circularRead_q15(py, delaySize, &readIndex, 1, 00105 pb, pb, blockSize, 1, blockSize); 00106 00107 /* Working pointer for the scratch buffer of state values */ 00108 px = pb; 00109 00110 /* Working pointer for scratch buffer of output values */ 00111 pScratchOut = pScr2; 00112 00113 /* Loop over the blockSize. Unroll by a factor of 4. 00114 * Compute 4 multiplications at a time. */ 00115 blkCnt = blockSize >> 2; 00116 00117 while(blkCnt > 0u) 00118 { 00119 /* Perform multiplication and store in the scratch buffer */ 00120 *pScratchOut++ = ((q31_t) * px++ * coeff); 00121 *pScratchOut++ = ((q31_t) * px++ * coeff); 00122 *pScratchOut++ = ((q31_t) * px++ * coeff); 00123 *pScratchOut++ = ((q31_t) * px++ * coeff); 00124 00125 /* Decrement the loop counter */ 00126 blkCnt--; 00127 } 00128 00129 /* If the blockSize is not a multiple of 4, 00130 * compute the remaining samples */ 00131 blkCnt = blockSize % 0x4u; 00132 00133 while(blkCnt > 0u) 00134 { 00135 /* Perform multiplication and store in the scratch buffer */ 00136 *pScratchOut++ = ((q31_t) * px++ * coeff); 00137 00138 /* Decrement the loop counter */ 00139 blkCnt--; 00140 } 00141 00142 /* Load the coefficient value and 00143 * increment the coefficient buffer for the next set of state values */ 00144 coeff = *pCoeffs++; 00145 00146 /* Read Index, from where the state buffer should be read, is calculated. */ 00147 readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 00148 00149 /* Wraparound of readIndex */ 00150 if(readIndex < 0) 00151 { 00152 readIndex += (int32_t) delaySize; 00153 } 00154 00155 /* Loop over the number of taps. */ 00156 tapCnt = (uint32_t) numTaps - 1u; 00157 00158 while(tapCnt > 0u) 00159 { 00160 /* Working pointer for state buffer is updated */ 00161 py = pState; 00162 00163 /* blockSize samples are read from the state buffer */ 00164 arm_circularRead_q15(py, delaySize, &readIndex, 1, 00165 pb, pb, blockSize, 1, blockSize); 00166 00167 /* Working pointer for the scratch buffer of state values */ 00168 px = pb; 00169 00170 /* Working pointer for scratch buffer of output values */ 00171 pScratchOut = pScr2; 00172 00173 /* Loop over the blockSize. Unroll by a factor of 4. 00174 * Compute 4 MACS at a time. */ 00175 blkCnt = blockSize >> 2; 00176 00177 while(blkCnt > 0u) 00178 { 00179 /* Perform Multiply-Accumulate */ 00180 *pScratchOut++ += (q31_t) * px++ * coeff; 00181 *pScratchOut++ += (q31_t) * px++ * coeff; 00182 *pScratchOut++ += (q31_t) * px++ * coeff; 00183 *pScratchOut++ += (q31_t) * px++ * coeff; 00184 00185 /* Decrement the loop counter */ 00186 blkCnt--; 00187 } 00188 00189 /* If the blockSize is not a multiple of 4, 00190 * compute the remaining samples */ 00191 blkCnt = blockSize % 0x4u; 00192 00193 while(blkCnt > 0u) 00194 { 00195 /* Perform Multiply-Accumulate */ 00196 *pScratchOut++ += (q31_t) * px++ * coeff; 00197 00198 /* Decrement the loop counter */ 00199 blkCnt--; 00200 } 00201 00202 /* Load the coefficient value and 00203 * increment the coefficient buffer for the next set of state values */ 00204 coeff = *pCoeffs++; 00205 00206 /* Read Index, from where the state buffer should be read, is calculated. */ 00207 readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 00208 00209 /* Wraparound of readIndex */ 00210 if(readIndex < 0) 00211 { 00212 readIndex += (int32_t) delaySize; 00213 } 00214 00215 /* Decrement the tap loop counter */ 00216 tapCnt--; 00217 } 00218 00219 /* All the output values are in pScratchOut buffer. 00220 Convert them into 1.15 format, saturate and store in the destination buffer. */ 00221 /* Loop over the blockSize. */ 00222 blkCnt = blockSize >> 2; 00223 00224 while(blkCnt > 0u) 00225 { 00226 in1 = *pScr2++; 00227 in2 = *pScr2++; 00228 *__SIMD32(pOut)++ = 00229 __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), 00230 16); 00231 00232 in1 = *pScr2++; 00233 in2 = *pScr2++; 00234 *__SIMD32(pOut)++ = 00235 __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), 00236 16); 00237 00238 blkCnt--; 00239 00240 } 00241 00242 /* If the blockSize is not a multiple of 4, 00243 remaining samples are processed in the below loop */ 00244 blkCnt = blockSize % 0x4u; 00245 00246 while(blkCnt > 0u) 00247 { 00248 *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16); 00249 blkCnt--; 00250 } 00251 } 00252 00253 /** 00254 * @} end of FIR_Sparse group 00255 */
Generated on Tue Jul 12 2022 14:13:53 by 1.7.2