CMSIS DSP library
Dependents: performance_timer Surfboard_ gps2rtty Capstone ... more
arm_fir_fast_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_fir_fast_q15.c 00009 * 00010 * Description: Q15 Fast FIR filter processing function. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup FIR 00049 * @{ 00050 */ 00051 00052 /** 00053 * @param[in] *S points to an instance of the Q15 FIR filter structure. 00054 * @param[in] *pSrc points to the block of input data. 00055 * @param[out] *pDst points to the block of output data. 00056 * @param[in] blockSize number of samples to process per call. 00057 * @return none. 00058 * 00059 * <b>Scaling and Overflow Behavior:</b> 00060 * \par 00061 * This fast version uses a 32-bit accumulator with 2.30 format. 00062 * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit. 00063 * Thus, if the accumulator result overflows it wraps around and distorts the result. 00064 * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits. 00065 * The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result. 00066 * 00067 * \par 00068 * Refer to the function <code>arm_fir_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. Both the slow and the fast versions use the same instance structure. 00069 * Use the function <code>arm_fir_init_q15()</code> to initialize the filter structure. 00070 */ 00071 00072 void arm_fir_fast_q15( 00073 const arm_fir_instance_q15 * S, 00074 q15_t * pSrc, 00075 q15_t * pDst, 00076 uint32_t blockSize) 00077 { 00078 q15_t *pState = S->pState; /* State pointer */ 00079 q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 00080 q15_t *pStateCurnt; /* Points to the current sample of the state */ 00081 q31_t acc0, acc1, acc2, acc3; /* Accumulators */ 00082 q15_t *pb; /* Temporary pointer for coefficient buffer */ 00083 q15_t *px; /* Temporary q31 pointer for SIMD state buffer accesses */ 00084 q31_t x0, x1, x2, c0; /* Temporary variables to hold SIMD state and coefficient values */ 00085 uint32_t numTaps = S->numTaps; /* Number of taps in the filter */ 00086 uint32_t tapCnt, blkCnt; /* Loop counters */ 00087 00088 00089 /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ 00090 /* pStateCurnt points to the location where the new input data should be written */ 00091 pStateCurnt = &(S->pState[(numTaps - 1u)]); 00092 00093 /* Apply loop unrolling and compute 4 output values simultaneously. 00094 * The variables acc0 ... acc3 hold output values that are being computed: 00095 * 00096 * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] 00097 * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] 00098 * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] 00099 * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] 00100 */ 00101 00102 blkCnt = blockSize >> 2; 00103 00104 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00105 ** a second loop below computes the remaining 1 to 3 samples. */ 00106 while(blkCnt > 0u) 00107 { 00108 /* Copy four new input samples into the state buffer. 00109 ** Use 32-bit SIMD to move the 16-bit data. Only requires two copies. */ 00110 *pStateCurnt++ = *pSrc++; 00111 *pStateCurnt++ = *pSrc++; 00112 *pStateCurnt++ = *pSrc++; 00113 *pStateCurnt++ = *pSrc++; 00114 00115 00116 /* Set all accumulators to zero */ 00117 acc0 = 0; 00118 acc1 = 0; 00119 acc2 = 0; 00120 acc3 = 0; 00121 00122 /* Typecast q15_t pointer to q31_t pointer for state reading in q31_t */ 00123 px = pState; 00124 00125 /* Typecast q15_t pointer to q31_t pointer for coefficient reading in q31_t */ 00126 pb = pCoeffs; 00127 00128 /* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */ 00129 x0 = *__SIMD32(px)++; 00130 00131 /* Read the third and forth samples from the state buffer: x[n-N-2], x[n-N-3] */ 00132 x2 = *__SIMD32(px)++; 00133 00134 /* Loop over the number of taps. Unroll by a factor of 4. 00135 ** Repeat until we've computed numTaps-(numTaps%4) coefficients. */ 00136 tapCnt = numTaps >> 2; 00137 00138 while(tapCnt > 0) 00139 { 00140 /* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */ 00141 c0 = *__SIMD32(pb)++; 00142 00143 /* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */ 00144 acc0 = __SMLAD(x0, c0, acc0); 00145 00146 /* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */ 00147 acc2 = __SMLAD(x2, c0, acc2); 00148 00149 /* pack x[n-N-1] and x[n-N-2] */ 00150 #ifndef ARM_MATH_BIG_ENDIAN 00151 x1 = __PKHBT(x2, x0, 0); 00152 #else 00153 x1 = __PKHBT(x0, x2, 0); 00154 #endif 00155 00156 /* Read state x[n-N-4], x[n-N-5] */ 00157 x0 = _SIMD32_OFFSET(px); 00158 00159 /* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */ 00160 acc1 = __SMLADX(x1, c0, acc1); 00161 00162 /* pack x[n-N-3] and x[n-N-4] */ 00163 #ifndef ARM_MATH_BIG_ENDIAN 00164 x1 = __PKHBT(x0, x2, 0); 00165 #else 00166 x1 = __PKHBT(x2, x0, 0); 00167 #endif 00168 00169 /* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */ 00170 acc3 = __SMLADX(x1, c0, acc3); 00171 00172 /* Read coefficients b[N-2], b[N-3] */ 00173 c0 = *__SIMD32(pb)++; 00174 00175 /* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */ 00176 acc0 = __SMLAD(x2, c0, acc0); 00177 00178 /* Read state x[n-N-6], x[n-N-7] with offset */ 00179 x2 = _SIMD32_OFFSET(px + 2u); 00180 00181 /* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */ 00182 acc2 = __SMLAD(x0, c0, acc2); 00183 00184 /* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */ 00185 acc1 = __SMLADX(x1, c0, acc1); 00186 00187 /* pack x[n-N-5] and x[n-N-6] */ 00188 #ifndef ARM_MATH_BIG_ENDIAN 00189 x1 = __PKHBT(x2, x0, 0); 00190 #else 00191 x1 = __PKHBT(x0, x2, 0); 00192 #endif 00193 00194 /* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */ 00195 acc3 = __SMLADX(x1, c0, acc3); 00196 00197 /* Update state pointer for next state reading */ 00198 px += 4u; 00199 00200 /* Decrement tap count */ 00201 tapCnt--; 00202 00203 } 00204 00205 /* If the filter length is not a multiple of 4, compute the remaining filter taps. 00206 ** This is always be 2 taps since the filter length is even. */ 00207 if((numTaps & 0x3u) != 0u) 00208 { 00209 00210 /* Read last two coefficients */ 00211 c0 = *__SIMD32(pb)++; 00212 00213 /* Perform the multiply-accumulates */ 00214 acc0 = __SMLAD(x0, c0, acc0); 00215 acc2 = __SMLAD(x2, c0, acc2); 00216 00217 /* pack state variables */ 00218 #ifndef ARM_MATH_BIG_ENDIAN 00219 x1 = __PKHBT(x2, x0, 0); 00220 #else 00221 x1 = __PKHBT(x0, x2, 0); 00222 #endif 00223 00224 /* Read last state variables */ 00225 x0 = *__SIMD32(px); 00226 00227 /* Perform the multiply-accumulates */ 00228 acc1 = __SMLADX(x1, c0, acc1); 00229 00230 /* pack state variables */ 00231 #ifndef ARM_MATH_BIG_ENDIAN 00232 x1 = __PKHBT(x0, x2, 0); 00233 #else 00234 x1 = __PKHBT(x2, x0, 0); 00235 #endif 00236 00237 /* Perform the multiply-accumulates */ 00238 acc3 = __SMLADX(x1, c0, acc3); 00239 } 00240 00241 /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation. 00242 ** Then store the 4 outputs in the destination buffer. */ 00243 00244 #ifndef ARM_MATH_BIG_ENDIAN 00245 00246 *__SIMD32(pDst)++ = 00247 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); 00248 00249 *__SIMD32(pDst)++ = 00250 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); 00251 00252 #else 00253 00254 *__SIMD32(pDst)++ = 00255 __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); 00256 00257 *__SIMD32(pDst)++ = 00258 __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); 00259 00260 00261 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00262 00263 /* Advance the state pointer by 4 to process the next group of 4 samples */ 00264 pState = pState + 4u; 00265 00266 /* Decrement the loop counter */ 00267 blkCnt--; 00268 } 00269 00270 /* If the blockSize is not a multiple of 4, compute any remaining output samples here. 00271 ** No loop unrolling is used. */ 00272 blkCnt = blockSize % 0x4u; 00273 while(blkCnt > 0u) 00274 { 00275 /* Copy two samples into state buffer */ 00276 *pStateCurnt++ = *pSrc++; 00277 00278 /* Set the accumulator to zero */ 00279 acc0 = 0; 00280 00281 /* Use SIMD to hold states and coefficients */ 00282 px = pState; 00283 pb = pCoeffs; 00284 00285 tapCnt = numTaps >> 1u; 00286 00287 do 00288 { 00289 00290 acc0 += (q31_t) * px++ * *pb++; 00291 acc0 += (q31_t) * px++ * *pb++; 00292 00293 tapCnt--; 00294 } 00295 while(tapCnt > 0u); 00296 00297 /* The result is in 2.30 format. Convert to 1.15 with saturation. 00298 ** Then store the output in the destination buffer. */ 00299 *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); 00300 00301 /* Advance state pointer by 1 for the next sample */ 00302 pState = pState + 1u; 00303 00304 /* Decrement the loop counter */ 00305 blkCnt--; 00306 } 00307 00308 /* Processing is complete. 00309 ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. 00310 ** This prepares the state buffer for the next function call. */ 00311 00312 /* Points to the start of the state buffer */ 00313 pStateCurnt = S->pState; 00314 00315 /* Calculation of count for copying integer writes */ 00316 tapCnt = (numTaps - 1u) >> 2; 00317 00318 while(tapCnt > 0u) 00319 { 00320 *pStateCurnt++ = *pState++; 00321 *pStateCurnt++ = *pState++; 00322 *pStateCurnt++ = *pState++; 00323 *pStateCurnt++ = *pState++; 00324 00325 tapCnt--; 00326 00327 } 00328 00329 /* Calculation of count for remaining q15_t data */ 00330 tapCnt = (numTaps - 1u) % 0x4u; 00331 00332 /* copy remaining data */ 00333 while(tapCnt > 0u) 00334 { 00335 *pStateCurnt++ = *pState++; 00336 00337 /* Decrement the loop counter */ 00338 tapCnt--; 00339 } 00340 00341 } 00342 00343 /** 00344 * @} end of FIR group 00345 */
Generated on Tue Jul 12 2022 11:59:17 by 1.7.2