CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details
Dependents: K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more
arm_fir_sparse_q7.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 29. November 2010 00005 * $Revision: V1.0.3 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_fir_sparse_q7.c 00009 * 00010 * Description: Q7 sparse FIR filter processing function. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Version 1.0.3 2010/11/29 00015 * Re-organized the CMSIS folders and updated documentation. 00016 * 00017 * Version 1.0.2 2010/11/11 00018 * Documentation updated. 00019 * 00020 * Version 1.0.1 2010/10/05 00021 * Production release and review comments incorporated. 00022 * 00023 * Version 1.0.0 2010/09/20 00024 * Production release and review comments incorporated 00025 * 00026 * Version 0.0.7 2010/06/10 00027 * Misra-C changes done 00028 * ------------------------------------------------------------------- */ 00029 #include "arm_math.h" 00030 00031 00032 /** 00033 * @ingroup groupFilters 00034 */ 00035 00036 /** 00037 * @addtogroup FIR_Sparse 00038 * @{ 00039 */ 00040 00041 00042 /** 00043 * @brief Processing function for the Q7 sparse FIR filter. 00044 * @param[in] *S points to an instance of the Q7 sparse FIR structure. 00045 * @param[in] *pSrc points to the block of input data. 00046 * @param[out] *pDst points to the block of output data 00047 * @param[in] *pScratchIn points to a temporary buffer of size blockSize. 00048 * @param[in] *pScratchOut points to a temporary buffer of size blockSize. 00049 * @param[in] blockSize number of input samples to process per call. 00050 * @return none. 00051 * 00052 * <b>Scaling and Overflow Behavior:</b> 00053 * \par 00054 * The function is implemented using a 32-bit internal accumulator. 00055 * Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result. 00056 * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. 
00057 * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. 00058 * The accumulator is then converted to 18.7 format by discarding the low 7 bits. 00059 * Finally, the result is truncated to 1.7 format. 00060 */ 00061 00062 void arm_fir_sparse_q7( 00063 arm_fir_sparse_instance_q7 * S, 00064 q7_t * pSrc, 00065 q7_t * pDst, 00066 q7_t * pScratchIn, 00067 q31_t * pScratchOut, 00068 uint32_t blockSize) 00069 { 00070 00071 q7_t *pState = S->pState; /* State pointer */ 00072 q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 00073 q7_t *px; /* Scratch buffer pointer */ 00074 q7_t *py = pState; /* Temporary pointers for state buffer */ 00075 q7_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */ 00076 q7_t *pOut = pDst; /* Destination pointer */ 00077 int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */ 00078 uint32_t delaySize = S->maxDelay + blockSize; /* state length */ 00079 uint16_t numTaps = S->numTaps; /* Filter order */ 00080 int32_t readIndex; /* Read index of the state buffer */ 00081 uint32_t tapCnt, blkCnt; /* loop counters */ 00082 q7_t coeff = *pCoeffs++; /* Read the coefficient value */ 00083 q31_t *pScr2 = pScratchOut; /* Working pointer for scratch buffer of output values */ 00084 q31_t in; 00085 q7_t in1, in2, in3, in4; 00086 00087 /* BlockSize of Input samples are copied into the state buffer */ 00088 /* StateIndex points to the starting position to write in the state buffer */ 00089 arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1, 00090 blockSize); 00091 00092 /* Loop over the number of taps. */ 00093 tapCnt = numTaps; 00094 00095 /* Read Index, from where the state buffer should be read, is calculated. 
*/ 00096 readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; 00097 00098 /* Wraparound of readIndex */ 00099 if(readIndex < 0) 00100 { 00101 readIndex += (int32_t) delaySize; 00102 } 00103 00104 /* Working pointer for state buffer is updated */ 00105 py = pState; 00106 00107 /* blockSize samples are read from the state buffer */ 00108 arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb, 00109 (int32_t) blockSize, 1, blockSize); 00110 00111 /* Working pointer for the scratch buffer of state values */ 00112 px = pb; 00113 00114 /* Working pointer for scratch buffer of output values */ 00115 pScratchOut = pScr2; 00116 00117 /* Loop over the blockSize. Unroll by a factor of 4. 00118 * Compute 4 multiplications at a time. */ 00119 blkCnt = blockSize >> 2; 00120 00121 while(blkCnt > 0u) 00122 { 00123 /* Perform multiplication and store in the scratch buffer */ 00124 *pScratchOut++ = ((q31_t) * px++ * coeff); 00125 *pScratchOut++ = ((q31_t) * px++ * coeff); 00126 *pScratchOut++ = ((q31_t) * px++ * coeff); 00127 *pScratchOut++ = ((q31_t) * px++ * coeff); 00128 00129 /* Decrement the loop counter */ 00130 blkCnt--; 00131 } 00132 00133 /* If the blockSize is not a multiple of 4, 00134 * compute the remaining samples */ 00135 blkCnt = blockSize % 0x4u; 00136 00137 while(blkCnt > 0u) 00138 { 00139 /* Perform multiplication and store in the scratch buffer */ 00140 *pScratchOut++ = ((q31_t) * px++ * coeff); 00141 00142 /* Decrement the loop counter */ 00143 blkCnt--; 00144 } 00145 00146 /* Load the coefficient value and 00147 * increment the coefficient buffer for the next set of state values */ 00148 coeff = *pCoeffs++; 00149 00150 /* Read Index, from where the state buffer should be read, is calculated. 
*/ 00151 readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; 00152 00153 /* Wraparound of readIndex */ 00154 if(readIndex < 0) 00155 { 00156 readIndex += (int32_t) delaySize; 00157 } 00158 00159 /* Loop over the number of taps. */ 00160 tapCnt = (uint32_t) numTaps - 1u; 00161 00162 while(tapCnt > 0u) 00163 { 00164 /* Working pointer for state buffer is updated */ 00165 py = pState; 00166 00167 /* blockSize samples are read from the state buffer */ 00168 arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb, 00169 (int32_t) blockSize, 1, blockSize); 00170 00171 /* Working pointer for the scratch buffer of state values */ 00172 px = pb; 00173 00174 /* Working pointer for scratch buffer of output values */ 00175 pScratchOut = pScr2; 00176 00177 /* Loop over the blockSize. Unroll by a factor of 4. 00178 * Compute 4 MACS at a time. */ 00179 blkCnt = blockSize >> 2; 00180 00181 while(blkCnt > 0u) 00182 { 00183 /* Perform Multiply-Accumulate */ 00184 in = *pScratchOut + ((q31_t) * px++ * coeff); 00185 *pScratchOut++ = in; 00186 in = *pScratchOut + ((q31_t) * px++ * coeff); 00187 *pScratchOut++ = in; 00188 in = *pScratchOut + ((q31_t) * px++ * coeff); 00189 *pScratchOut++ = in; 00190 in = *pScratchOut + ((q31_t) * px++ * coeff); 00191 *pScratchOut++ = in; 00192 00193 /* Decrement the loop counter */ 00194 blkCnt--; 00195 } 00196 00197 /* If the blockSize is not a multiple of 4, 00198 * compute the remaining samples */ 00199 blkCnt = blockSize % 0x4u; 00200 00201 while(blkCnt > 0u) 00202 { 00203 /* Perform Multiply-Accumulate */ 00204 in = *pScratchOut + ((q31_t) * px++ * coeff); 00205 *pScratchOut++ = in; 00206 00207 /* Decrement the loop counter */ 00208 blkCnt--; 00209 } 00210 00211 /* Load the coefficient value and 00212 * increment the coefficient buffer for the next set of state values */ 00213 coeff = *pCoeffs++; 00214 00215 /* Read Index, from where the state buffer should be read, is calculated. 
*/ 00216 readIndex = ((int32_t) S->stateIndex - 00217 (int32_t) blockSize) - *pTapDelay++; 00218 00219 /* Wraparound of readIndex */ 00220 if(readIndex < 0) 00221 { 00222 readIndex += (int32_t) delaySize; 00223 } 00224 00225 /* Decrement the tap loop counter */ 00226 tapCnt--; 00227 } 00228 00229 /* All the output values are in pScratchOut buffer. 00230 Convert them into 1.15 format, saturate and store in the destination buffer. */ 00231 /* Loop over the blockSize. */ 00232 blkCnt = blockSize >> 2; 00233 00234 while(blkCnt > 0u) 00235 { 00236 in1 = (q7_t) __SSAT(*pScr2++ >> 7, 8); 00237 in2 = (q7_t) __SSAT(*pScr2++ >> 7, 8); 00238 in3 = (q7_t) __SSAT(*pScr2++ >> 7, 8); 00239 in4 = (q7_t) __SSAT(*pScr2++ >> 7, 8); 00240 00241 *__SIMD32(pOut)++ = __PACKq7(in1, in2, in3, in4); 00242 00243 /* Decrement the blockSize loop counter */ 00244 blkCnt--; 00245 } 00246 00247 /* If the blockSize is not a multiple of 4, 00248 remaining samples are processed in the below loop */ 00249 blkCnt = blockSize % 0x4u; 00250 00251 while(blkCnt > 0u) 00252 { 00253 *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8); 00254 00255 /* Decrement the blockSize loop counter */ 00256 blkCnt--; 00257 } 00258 } 00259 00260 /** 00261 * @} end of FIR_Sparse group 00262 */
Generated on Tue Jul 12 2022 14:13:53 by Doxygen 1.7.2