Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository Zip archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of mbed-dsp by
arm_fir_fast_q31.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_fir_fast_q31.c 00009 * 00010 * Description: Processing function for the Q31 Fast FIR filter. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup FIR 00049 * @{ 00050 */ 00051 00052 /** 00053 * @param[in] *S points to an instance of the Q31 structure. 00054 * @param[in] *pSrc points to the block of input data. 00055 * @param[out] *pDst points to the block output data. 00056 * @param[in] blockSize number of samples to process per call. 00057 * @return none. 00058 * 00059 * <b>Scaling and Overflow Behavior:</b> 00060 * 00061 * \par 00062 * This function is optimized for speed at the expense of fixed-point precision and overflow protection. 00063 * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format. 00064 * These intermediate results are added to a 2.30 accumulator. 00065 * Finally, the accumulator is saturated and converted to a 1.31 result. 00066 * The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result. 00067 * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits. 00068 * 00069 * \par 00070 * Refer to the function <code>arm_fir_q31()</code> for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision. 
Both the slow and the fast versions use the same instance structure. 00071 * Use the function <code>arm_fir_init_q31()</code> to initialize the filter structure. 00072 */ 00073 00074 IAR_ONLY_LOW_OPTIMIZATION_ENTER 00075 void arm_fir_fast_q31( 00076 const arm_fir_instance_q31 * S, 00077 q31_t * pSrc, 00078 q31_t * pDst, 00079 uint32_t blockSize) 00080 { 00081 q31_t *pState = S->pState; /* State pointer */ 00082 q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 00083 q31_t *pStateCurnt; /* Points to the current sample of the state */ 00084 q31_t x0, x1, x2, x3; /* Temporary variables to hold state */ 00085 q31_t c0; /* Temporary variable to hold coefficient value */ 00086 q31_t *px; /* Temporary pointer for state */ 00087 q31_t *pb; /* Temporary pointer for coefficient buffer */ 00088 q31_t acc0, acc1, acc2, acc3; /* Accumulators */ 00089 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ 00090 uint32_t i, tapCnt, blkCnt; /* Loop counters */ 00091 00092 /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */ 00093 /* pStateCurnt points to the location where the new input data should be written */ 00094 pStateCurnt = &(S->pState[(numTaps - 1u)]); 00095 00096 /* Apply loop unrolling and compute 4 output values simultaneously. 00097 * The variables acc0 ... 
acc3 hold output values that are being computed: 00098 * 00099 * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] 00100 * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] 00101 * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] 00102 * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] 00103 */ 00104 blkCnt = blockSize >> 2; 00105 00106 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00107 ** a second loop below computes the remaining 1 to 3 samples. */ 00108 while(blkCnt > 0u) 00109 { 00110 /* Copy four new input samples into the state buffer */ 00111 *pStateCurnt++ = *pSrc++; 00112 *pStateCurnt++ = *pSrc++; 00113 *pStateCurnt++ = *pSrc++; 00114 *pStateCurnt++ = *pSrc++; 00115 00116 /* Set all accumulators to zero */ 00117 acc0 = 0; 00118 acc1 = 0; 00119 acc2 = 0; 00120 acc3 = 0; 00121 00122 /* Initialize state pointer */ 00123 px = pState; 00124 00125 /* Initialize coefficient pointer */ 00126 pb = pCoeffs; 00127 00128 /* Read the first three samples from the state buffer: 00129 * x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */ 00130 x0 = *(px++); 00131 x1 = *(px++); 00132 x2 = *(px++); 00133 00134 /* Loop unrolling. Process 4 taps at a time. 
*/ 00135 tapCnt = numTaps >> 2; 00136 i = tapCnt; 00137 00138 while(i > 0u) 00139 { 00140 /* Read the b[numTaps] coefficient */ 00141 c0 = *(pb++); 00142 00143 /* Read x[n-numTaps-3] sample */ 00144 x3 = *(px++); 00145 00146 /* acc0 += b[numTaps] * x[n-numTaps] */ 00147 multAcc_32x32_keep32_R(acc0, x0, c0); 00148 00149 /* acc1 += b[numTaps] * x[n-numTaps-1] */ 00150 multAcc_32x32_keep32_R(acc1, x1, c0); 00151 00152 /* acc2 += b[numTaps] * x[n-numTaps-2] */ 00153 multAcc_32x32_keep32_R(acc2, x2, c0); 00154 00155 /* acc3 += b[numTaps] * x[n-numTaps-3] */ 00156 multAcc_32x32_keep32_R(acc3, x3, c0); 00157 00158 /* Read the b[numTaps-1] coefficient */ 00159 c0 = *(pb++); 00160 00161 /* Read x[n-numTaps-4] sample */ 00162 x0 = *(px++); 00163 00164 /* Perform the multiply-accumulates */ 00165 multAcc_32x32_keep32_R(acc0, x1, c0); 00166 multAcc_32x32_keep32_R(acc1, x2, c0); 00167 multAcc_32x32_keep32_R(acc2, x3, c0); 00168 multAcc_32x32_keep32_R(acc3, x0, c0); 00169 00170 /* Read the b[numTaps-2] coefficient */ 00171 c0 = *(pb++); 00172 00173 /* Read x[n-numTaps-5] sample */ 00174 x1 = *(px++); 00175 00176 /* Perform the multiply-accumulates */ 00177 multAcc_32x32_keep32_R(acc0, x2, c0); 00178 multAcc_32x32_keep32_R(acc1, x3, c0); 00179 multAcc_32x32_keep32_R(acc2, x0, c0); 00180 multAcc_32x32_keep32_R(acc3, x1, c0); 00181 00182 /* Read the b[numTaps-3] coefficients */ 00183 c0 = *(pb++); 00184 00185 /* Read x[n-numTaps-6] sample */ 00186 x2 = *(px++); 00187 00188 /* Perform the multiply-accumulates */ 00189 multAcc_32x32_keep32_R(acc0, x3, c0); 00190 multAcc_32x32_keep32_R(acc1, x0, c0); 00191 multAcc_32x32_keep32_R(acc2, x1, c0); 00192 multAcc_32x32_keep32_R(acc3, x2, c0); 00193 i--; 00194 } 00195 00196 /* If the filter length is not a multiple of 4, compute the remaining filter taps */ 00197 00198 i = numTaps - (tapCnt * 4u); 00199 while(i > 0u) 00200 { 00201 /* Read coefficients */ 00202 c0 = *(pb++); 00203 00204 /* Fetch 1 state variable */ 00205 x3 = *(px++); 00206 
00207 /* Perform the multiply-accumulates */ 00208 multAcc_32x32_keep32_R(acc0, x0, c0); 00209 multAcc_32x32_keep32_R(acc1, x1, c0); 00210 multAcc_32x32_keep32_R(acc2, x2, c0); 00211 multAcc_32x32_keep32_R(acc3, x3, c0); 00212 00213 /* Reuse the present sample states for next sample */ 00214 x0 = x1; 00215 x1 = x2; 00216 x2 = x3; 00217 00218 /* Decrement the loop counter */ 00219 i--; 00220 } 00221 00222 /* Advance the state pointer by 4 to process the next group of 4 samples */ 00223 pState = pState + 4; 00224 00225 /* The results in the 4 accumulators are in 2.30 format. Convert to 1.31 00226 ** Then store the 4 outputs in the destination buffer. */ 00227 *pDst++ = (q31_t) (acc0 << 1); 00228 *pDst++ = (q31_t) (acc1 << 1); 00229 *pDst++ = (q31_t) (acc2 << 1); 00230 *pDst++ = (q31_t) (acc3 << 1); 00231 00232 /* Decrement the samples loop counter */ 00233 blkCnt--; 00234 } 00235 00236 00237 /* If the blockSize is not a multiple of 4, compute any remaining output samples here. 00238 ** No loop unrolling is used. */ 00239 blkCnt = blockSize % 4u; 00240 00241 while(blkCnt > 0u) 00242 { 00243 /* Copy one sample at a time into state buffer */ 00244 *pStateCurnt++ = *pSrc++; 00245 00246 /* Set the accumulator to zero */ 00247 acc0 = 0; 00248 00249 /* Initialize state pointer */ 00250 px = pState; 00251 00252 /* Initialize Coefficient pointer */ 00253 pb = (pCoeffs); 00254 00255 i = numTaps; 00256 00257 /* Perform the multiply-accumulates */ 00258 do 00259 { 00260 multAcc_32x32_keep32_R(acc0, (*px++), (*(pb++))); 00261 i--; 00262 } while(i > 0u); 00263 00264 /* The result is in 2.30 format. Convert to 1.31 00265 ** Then store the output in the destination buffer. */ 00266 *pDst++ = (q31_t) (acc0 << 1); 00267 00268 /* Advance state pointer by 1 for the next sample */ 00269 pState = pState + 1; 00270 00271 /* Decrement the samples loop counter */ 00272 blkCnt--; 00273 } 00274 00275 /* Processing is complete. 
00276 ** Now copy the last numTaps - 1 samples to the start of the state buffer. 00277 ** This prepares the state buffer for the next function call. */ 00278 00279 /* Points to the start of the state buffer */ 00280 pStateCurnt = S->pState; 00281 00282 /* Calculate remaining number of copies */ 00283 tapCnt = (numTaps - 1u); 00284 00285 /* Copy the remaining q31_t data */ 00286 while(tapCnt > 0u) 00287 { 00288 *pStateCurnt++ = *pState++; 00289 00290 /* Decrement the loop counter */ 00291 tapCnt--; 00292 } 00293 00294 00295 } 00296 IAR_ONLY_LOW_OPTIMIZATION_EXIT 00297 /** 00298 * @} end of FIR group 00299 */
Generated on Tue Jul 12 2022 18:44:09 by Doxygen 1.7.2
