CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details
Dependents: K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more
arm_fir_fast_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 29. November 2010 00005 * $Revision: V1.0.3 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_fir_fast_q15.c 00009 * 00010 * Description: Q15 Fast FIR filter processing function. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Version 1.0.3 2010/11/29 00015 * Re-organized the CMSIS folders and updated documentation. 00016 * 00017 * Version 1.0.2 2010/11/11 00018 * Documentation updated. 00019 * 00020 * Version 1.0.1 2010/10/05 00021 * Production release and review comments incorporated. 00022 * 00023 * Version 1.0.0 2010/09/20 00024 * Production release and review comments incorporated. 00025 * 00026 * Version 0.0.9 2010/08/16 00027 * Initial version 00028 * 00029 * -------------------------------------------------------------------- */ 00030 00031 #include "arm_math.h" 00032 00033 /** 00034 * @ingroup groupFilters 00035 */ 00036 00037 /** 00038 * @addtogroup FIR 00039 * @{ 00040 */ 00041 00042 /** 00043 * @param[in] *S points to an instance of the Q15 FIR filter structure. 00044 * @param[in] *pSrc points to the block of input data. 00045 * @param[out] *pDst points to the block of output data. 00046 * @param[in] blockSize number of samples to process per call. 00047 * @return none. 00048 * 00049 * <b>Scaling and Overflow Behavior:</b> 00050 * \par 00051 * This fast version uses a 32-bit accumulator with 2.30 format. 00052 * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit. 00053 * Thus, if the accumulator result overflows it wraps around and distorts the result. 00054 * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits. 00055 * The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result. 00056 * 00057 * \par 00058 * Refer to the function <code>arm_fir_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. Both the slow and the fast versions use the same instance structure. 00059 * Use the function <code>arm_fir_init_q15()</code> to initialize the filter structure. 00060 */ 00061 00062 void arm_fir_fast_q15( 00063 const arm_fir_instance_q15 * S, 00064 q15_t * pSrc, 00065 q15_t * pDst, 00066 uint32_t blockSize) 00067 { 00068 q15_t *pState = S->pState; /* State pointer */ 00069 q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 00070 q15_t *pStateCurnt; /* Points to the current sample of the state */ 00071 q15_t *px1; /* Temporary q15 pointer for state buffer */ 00072 q31_t *pb; /* Temporary pointer for coefficient buffer */ 00073 q31_t *px2; /* Temporary q31 pointer for SIMD state buffer accesses */ 00074 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold SIMD state and coefficient values */ 00075 q31_t acc0, acc1, acc2, acc3; /* Accumulators */ 00076 uint32_t numTaps = S->numTaps; /* Number of taps in the filter */ 00077 uint32_t tapCnt, blkCnt; /* Loop counters */ 00078 00079 /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */ 00080 /* pStateCurnt points to the location where the new input data should be written */ 00081 pStateCurnt = &(S->pState[(numTaps - 1u)]); 00082 00083 /* Apply loop unrolling and compute 4 output values simultaneously. 00084 * The variables acc0 ... acc3 hold output values that are being computed: 00085 * 00086 * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] 00087 * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] 00088 * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] 00089 * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] 00090 */ 00091 blkCnt = blockSize >> 2; 00092 00093 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00094 ** a second loop below computes the remaining 1 to 3 samples. */ 00095 while(blkCnt > 0u) 00096 { 00097 /* Copy four new input samples into the state buffer. 00098 ** Use 32-bit SIMD to move the 16-bit data. Only requires two copies. */ 00099 *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++; 00100 *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++; 00101 00102 /* Set all accumulators to zero */ 00103 acc0 = 0; 00104 acc1 = 0; 00105 acc2 = 0; 00106 acc3 = 0; 00107 00108 /* Initialize state pointer of type q15 */ 00109 px1 = pState; 00110 00111 /* Initialize coeff pointer of type q31 */ 00112 pb = (q31_t *) (pCoeffs); 00113 00114 /* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */ 00115 x0 = *(q31_t *) (px1++); 00116 00117 /* Read the third and forth samples from the state buffer: x[n-N-1], x[n-N-2] */ 00118 x1 = *(q31_t *) (px1++); 00119 00120 /* Loop over the number of taps. Unroll by a factor of 4. 00121 ** Repeat until we've computed numTaps-4 coefficients. */ 00122 tapCnt = numTaps >> 2; 00123 do 00124 { 00125 /* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */ 00126 c0 = *(pb++); 00127 00128 /* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */ 00129 acc0 = __SMLAD(x0, c0, acc0); 00130 00131 /* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */ 00132 acc1 = __SMLAD(x1, c0, acc1); 00133 00134 /* Read state x[n-N-2], x[n-N-3] */ 00135 x2 = *(q31_t *) (px1++); 00136 00137 /* Read state x[n-N-3], x[n-N-4] */ 00138 x3 = *(q31_t *) (px1++); 00139 00140 /* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */ 00141 acc2 = __SMLAD(x2, c0, acc2); 00142 00143 /* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */ 00144 acc3 = __SMLAD(x3, c0, acc3); 00145 00146 /* Read coefficients b[N-2], b[N-3] */ 00147 c0 = *(pb++); 00148 00149 /* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */ 00150 acc0 = __SMLAD(x2, c0, acc0); 00151 00152 /* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */ 00153 acc1 = __SMLAD(x3, c0, acc1); 00154 00155 /* Read state x[n-N-4], x[n-N-5] */ 00156 x0 = *(q31_t *) (px1++); 00157 00158 /* Read state x[n-N-5], x[n-N-6] */ 00159 x1 = *(q31_t *) (px1++); 00160 00161 /* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */ 00162 acc2 = __SMLAD(x0, c0, acc2); 00163 00164 /* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */ 00165 acc3 = __SMLAD(x1, c0, acc3); 00166 tapCnt--; 00167 00168 } 00169 while(tapCnt > 0u); 00170 00171 /* If the filter length is not a multiple of 4, compute the remaining filter taps. 00172 ** This is always 2 taps since the filter length is always even. */ 00173 if((numTaps & 0x3u) != 0u) 00174 { 00175 /* Read 2 coefficients */ 00176 c0 = *(pb++); 00177 /* Fetch 4 state variables */ 00178 x2 = *(q31_t *) (px1++); 00179 x3 = *(q31_t *) (px1++); 00180 00181 /* Perform the multiply-accumulates */ 00182 acc0 = __SMLAD(x0, c0, acc0); 00183 acc1 = __SMLAD(x1, c0, acc1); 00184 acc2 = __SMLAD(x2, c0, acc2); 00185 acc3 = __SMLAD(x3, c0, acc3); 00186 } 00187 00188 /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation. 00189 ** Then store the 4 outputs in the destination buffer. */ 00190 *__SIMD32(pDst)++ = __PKHBT((acc0 >> 15), (acc1 >> 15), 16u); 00191 *__SIMD32(pDst)++ = __PKHBT((acc2 >> 15), (acc3 >> 15), 16u); 00192 00193 00194 /* Advance the state pointer by 4 to process the next group of 4 samples */ 00195 pState = pState + 4; 00196 00197 /* Decrement the loop counter */ 00198 blkCnt--; 00199 } 00200 00201 /* If the blockSize is not a multiple of 4, compute any remaining output samples here. 00202 ** No loop unrolling is used. */ 00203 blkCnt = blockSize % 0x4u; 00204 while(blkCnt > 0u) 00205 { 00206 /* Copy two samples into state buffer */ 00207 *pStateCurnt++ = *pSrc++; 00208 00209 /* Set the accumulator to zero */ 00210 acc0 = 0; 00211 00212 /* Use SIMD to hold states and coefficients */ 00213 px2 = (q31_t *) pState; 00214 pb = (q31_t *) (pCoeffs); 00215 tapCnt = numTaps >> 1; 00216 00217 do 00218 { 00219 acc0 = __SMLAD(*px2++, *(pb++), acc0); 00220 tapCnt--; 00221 } 00222 while(tapCnt > 0u); 00223 00224 /* The result is in 2.30 format. Convert to 1.15 with saturation. 00225 ** Then store the output in the destination buffer. */ 00226 *pDst++ = (q15_t) ((acc0 >> 15)); 00227 00228 /* Advance state pointer by 1 for the next sample */ 00229 pState = pState + 1; 00230 00231 /* Decrement the loop counter */ 00232 blkCnt--; 00233 } 00234 00235 /* Processing is complete. 00236 ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. 00237 ** This prepares the state buffer for the next function call. */ 00238 00239 /* Points to the start of the state buffer */ 00240 pStateCurnt = S->pState; 00241 /* Calculation of count for copying integer writes */ 00242 tapCnt = (numTaps - 1u) >> 2; 00243 00244 while(tapCnt > 0u) 00245 { 00246 *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; 00247 *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; 00248 00249 tapCnt--; 00250 } 00251 00252 /* Calculation of count for remaining q15_t data */ 00253 tapCnt = (numTaps - 1u) % 0x4u; 00254 00255 /* copy remaining data */ 00256 while(tapCnt > 0u) 00257 { 00258 *pStateCurnt++ = *pState++; 00259 00260 /* Decrement the loop counter */ 00261 tapCnt--; 00262 } 00263 } 00264 00265 /** 00266 * @} end of FIR group 00267 */
Generated on Tue Jul 12 2022 14:13:53 by 1.7.2