Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software download the repository Zip archive or clone locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of mbed-dsp by
arm_fir_decimate_fast_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_fir_decimate_fast_q15.c 00009 * 00010 * Description: Fast Q15 FIR Decimator. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup FIR_decimate 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Processing function for the Q15 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4. 00054 * @param[in] *S points to an instance of the Q15 FIR decimator structure. 00055 * @param[in] *pSrc points to the block of input data. 00056 * @param[out] *pDst points to the block of output data 00057 * @param[in] blockSize number of input samples to process per call. 00058 * @return none 00059 * 00060 * \par Restrictions 00061 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00062 * In this case input, output, state buffers should be aligned by 32-bit 00063 * 00064 * <b>Scaling and Overflow Behavior:</b> 00065 * \par 00066 * This fast version uses a 32-bit accumulator with 2.30 format. 00067 * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit. 00068 * Thus, if the accumulator result overflows it wraps around and distorts the result. 00069 * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (log2 is read as log to the base 2). 00070 * The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result. 00071 * 00072 * \par 00073 * Refer to the function <code>arm_fir_decimate_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. 00074 * Both the slow and the fast versions use the same instance structure. 00075 * Use the function <code>arm_fir_decimate_init_q15()</code> to initialize the filter structure. 00076 */ 00077 00078 #ifndef UNALIGNED_SUPPORT_DISABLE 00079 00080 void arm_fir_decimate_fast_q15( 00081 const arm_fir_decimate_instance_q15 * S, 00082 q15_t * pSrc, 00083 q15_t * pDst, 00084 uint32_t blockSize) 00085 { 00086 q15_t *pState = S->pState; /* State pointer */ 00087 q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 00088 q15_t *pStateCurnt; /* Points to the current sample of the state */ 00089 q15_t *px; /* Temporary pointer for state buffer */ 00090 q15_t *pb; /* Temporary pointer coefficient buffer */ 00091 q31_t x0, x1, c0, c1; /* Temporary variables to hold state and coefficient values */ 00092 q31_t sum0; /* Accumulators */ 00093 q31_t acc0, acc1; 00094 q15_t *px0, *px1; 00095 uint32_t blkCntN3; 00096 uint32_t numTaps = S->numTaps; /* Number of taps */ 00097 uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */ 00098 00099 00100 /* S->pState buffer contains previous frame (numTaps - 1) samples */ 00101 /* pStateCurnt points to the location where the new input data should be written */ 00102 pStateCurnt = S->pState + (numTaps - 1u); 00103 00104 00105 /* Total number of output samples to be computed */ 00106 blkCnt = outBlockSize / 2; 00107 blkCntN3 = outBlockSize - (2 * blkCnt); 00108 00109 00110 while(blkCnt > 0u) 00111 { 00112 /* Copy decimation factor number of new input samples into the state buffer */ 00113 i = 2 * S->M; 00114 00115 do 00116 { 00117 *pStateCurnt++ = *pSrc++; 00118 00119 } while(--i); 00120 00121 /* Set accumulator to zero */ 00122 acc0 = 0; 00123 acc1 = 0; 00124 00125 /* Initialize state pointer */ 00126 px0 = pState; 00127 00128 px1 = pState + S->M; 00129 00130 00131 /* Initialize coeff pointer */ 00132 pb = pCoeffs; 00133 00134 /* Loop unrolling. Process 4 taps at a time. */ 00135 tapCnt = numTaps >> 2; 00136 00137 /* Loop over the number of taps. Unroll by a factor of 4. 00138 ** Repeat until we've computed numTaps-4 coefficients. */ 00139 while(tapCnt > 0u) 00140 { 00141 /* Read the Read b[numTaps-1] and b[numTaps-2] coefficients */ 00142 c0 = *__SIMD32(pb)++; 00143 00144 /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */ 00145 x0 = *__SIMD32(px0)++; 00146 00147 x1 = *__SIMD32(px1)++; 00148 00149 /* Perform the multiply-accumulate */ 00150 acc0 = __SMLAD(x0, c0, acc0); 00151 00152 acc1 = __SMLAD(x1, c0, acc1); 00153 00154 /* Read the b[numTaps-3] and b[numTaps-4] coefficient */ 00155 c0 = *__SIMD32(pb)++; 00156 00157 /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */ 00158 x0 = *__SIMD32(px0)++; 00159 00160 x1 = *__SIMD32(px1)++; 00161 00162 /* Perform the multiply-accumulate */ 00163 acc0 = __SMLAD(x0, c0, acc0); 00164 00165 acc1 = __SMLAD(x1, c0, acc1); 00166 00167 /* Decrement the loop counter */ 00168 tapCnt--; 00169 } 00170 00171 /* If the filter length is not a multiple of 4, compute the remaining filter taps */ 00172 tapCnt = numTaps % 0x4u; 00173 00174 while(tapCnt > 0u) 00175 { 00176 /* Read coefficients */ 00177 c0 = *pb++; 00178 00179 /* Fetch 1 state variable */ 00180 x0 = *px0++; 00181 00182 x1 = *px1++; 00183 00184 /* Perform the multiply-accumulate */ 00185 acc0 = __SMLAD(x0, c0, acc0); 00186 acc1 = __SMLAD(x1, c0, acc1); 00187 00188 /* Decrement the loop counter */ 00189 tapCnt--; 00190 } 00191 00192 /* Advance the state pointer by the decimation factor 00193 * to process the next group of decimation factor number samples */ 00194 pState = pState + S->M * 2; 00195 00196 /* Store filter output, smlad returns the values in 2.14 format */ 00197 /* so downsacle by 15 to get output in 1.15 */ 00198 *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); 00199 *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16)); 00200 00201 /* Decrement the loop counter */ 00202 blkCnt--; 00203 } 00204 00205 00206 00207 while(blkCntN3 > 0u) 00208 { 00209 /* Copy decimation factor number of new input samples into the state buffer */ 00210 i = S->M; 00211 00212 do 00213 { 00214 *pStateCurnt++ = *pSrc++; 00215 00216 } while(--i); 00217 00218 /*Set sum to zero */ 00219 sum0 = 0; 00220 00221 /* Initialize state pointer */ 00222 px = pState; 00223 00224 /* Initialize coeff pointer */ 00225 pb = pCoeffs; 00226 00227 /* Loop unrolling. Process 4 taps at a time. */ 00228 tapCnt = numTaps >> 2; 00229 00230 /* Loop over the number of taps. Unroll by a factor of 4. 00231 ** Repeat until we've computed numTaps-4 coefficients. */ 00232 while(tapCnt > 0u) 00233 { 00234 /* Read the Read b[numTaps-1] and b[numTaps-2] coefficients */ 00235 c0 = *__SIMD32(pb)++; 00236 00237 /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */ 00238 x0 = *__SIMD32(px)++; 00239 00240 /* Read the b[numTaps-3] and b[numTaps-4] coefficient */ 00241 c1 = *__SIMD32(pb)++; 00242 00243 /* Perform the multiply-accumulate */ 00244 sum0 = __SMLAD(x0, c0, sum0); 00245 00246 /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */ 00247 x0 = *__SIMD32(px)++; 00248 00249 /* Perform the multiply-accumulate */ 00250 sum0 = __SMLAD(x0, c1, sum0); 00251 00252 /* Decrement the loop counter */ 00253 tapCnt--; 00254 } 00255 00256 /* If the filter length is not a multiple of 4, compute the remaining filter taps */ 00257 tapCnt = numTaps % 0x4u; 00258 00259 while(tapCnt > 0u) 00260 { 00261 /* Read coefficients */ 00262 c0 = *pb++; 00263 00264 /* Fetch 1 state variable */ 00265 x0 = *px++; 00266 00267 /* Perform the multiply-accumulate */ 00268 sum0 = __SMLAD(x0, c0, sum0); 00269 00270 /* Decrement the loop counter */ 00271 tapCnt--; 00272 } 00273 00274 /* Advance the state pointer by the decimation factor 00275 * to process the next group of decimation factor number samples */ 00276 pState = pState + S->M; 00277 00278 /* Store filter output, smlad returns the values in 2.14 format */ 00279 /* so downsacle by 15 to get output in 1.15 */ 00280 *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16)); 00281 00282 /* Decrement the loop counter */ 00283 blkCntN3--; 00284 } 00285 00286 /* Processing is complete. 00287 ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. 00288 ** This prepares the state buffer for the next function call. */ 00289 00290 /* Points to the start of the state buffer */ 00291 pStateCurnt = S->pState; 00292 00293 i = (numTaps - 1u) >> 2u; 00294 00295 /* copy data */ 00296 while(i > 0u) 00297 { 00298 *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; 00299 *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; 00300 00301 /* Decrement the loop counter */ 00302 i--; 00303 } 00304 00305 i = (numTaps - 1u) % 0x04u; 00306 00307 /* copy data */ 00308 while(i > 0u) 00309 { 00310 *pStateCurnt++ = *pState++; 00311 00312 /* Decrement the loop counter */ 00313 i--; 00314 } 00315 } 00316 00317 #else 00318 00319 00320 void arm_fir_decimate_fast_q15( 00321 const arm_fir_decimate_instance_q15 * S, 00322 q15_t * pSrc, 00323 q15_t * pDst, 00324 uint32_t blockSize) 00325 { 00326 q15_t *pState = S->pState; /* State pointer */ 00327 q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 00328 q15_t *pStateCurnt; /* Points to the current sample of the state */ 00329 q15_t *px; /* Temporary pointer for state buffer */ 00330 q15_t *pb; /* Temporary pointer coefficient buffer */ 00331 q15_t x0, x1, c0; /* Temporary variables to hold state and coefficient values */ 00332 q31_t sum0; /* Accumulators */ 00333 q31_t acc0, acc1; 00334 q15_t *px0, *px1; 00335 uint32_t blkCntN3; 00336 uint32_t numTaps = S->numTaps; /* Number of taps */ 00337 uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */ 00338 00339 00340 /* S->pState buffer contains previous frame (numTaps - 1) samples */ 00341 /* pStateCurnt points to the location where the new input data should be written */ 00342 pStateCurnt = S->pState + (numTaps - 1u); 00343 00344 00345 /* Total number of output samples to be computed */ 00346 blkCnt = outBlockSize / 2; 00347 blkCntN3 = outBlockSize - (2 * blkCnt); 00348 00349 while(blkCnt > 0u) 00350 { 00351 /* Copy decimation factor number of new input samples into the state buffer */ 00352 i = 2 * S->M; 00353 00354 do 00355 { 00356 *pStateCurnt++ = *pSrc++; 00357 00358 } while(--i); 00359 00360 /* Set accumulator to zero */ 00361 acc0 = 0; 00362 acc1 = 0; 00363 00364 /* Initialize state pointer */ 00365 px0 = pState; 00366 00367 px1 = pState + S->M; 00368 00369 00370 /* Initialize coeff pointer */ 00371 pb = pCoeffs; 00372 00373 /* Loop unrolling. Process 4 taps at a time. */ 00374 tapCnt = numTaps >> 2; 00375 00376 /* Loop over the number of taps. Unroll by a factor of 4. 00377 ** Repeat until we've computed numTaps-4 coefficients. */ 00378 while(tapCnt > 0u) 00379 { 00380 /* Read the Read b[numTaps-1] coefficients */ 00381 c0 = *pb++; 00382 00383 /* Read x[n-numTaps-1] for sample 0 and for sample 1 */ 00384 x0 = *px0++; 00385 x1 = *px1++; 00386 00387 /* Perform the multiply-accumulate */ 00388 acc0 += x0 * c0; 00389 acc1 += x1 * c0; 00390 00391 /* Read the b[numTaps-2] coefficient */ 00392 c0 = *pb++; 00393 00394 /* Read x[n-numTaps-2] for sample 0 and sample 1 */ 00395 x0 = *px0++; 00396 x1 = *px1++; 00397 00398 /* Perform the multiply-accumulate */ 00399 acc0 += x0 * c0; 00400 acc1 += x1 * c0; 00401 00402 /* Read the b[numTaps-3] coefficients */ 00403 c0 = *pb++; 00404 00405 /* Read x[n-numTaps-3] for sample 0 and sample 1 */ 00406 x0 = *px0++; 00407 x1 = *px1++; 00408 00409 /* Perform the multiply-accumulate */ 00410 acc0 += x0 * c0; 00411 acc1 += x1 * c0; 00412 00413 /* Read the b[numTaps-4] coefficient */ 00414 c0 = *pb++; 00415 00416 /* Read x[n-numTaps-4] for sample 0 and sample 1 */ 00417 x0 = *px0++; 00418 x1 = *px1++; 00419 00420 /* Perform the multiply-accumulate */ 00421 acc0 += x0 * c0; 00422 acc1 += x1 * c0; 00423 00424 /* Decrement the loop counter */ 00425 tapCnt--; 00426 } 00427 00428 /* If the filter length is not a multiple of 4, compute the remaining filter taps */ 00429 tapCnt = numTaps % 0x4u; 00430 00431 while(tapCnt > 0u) 00432 { 00433 /* Read coefficients */ 00434 c0 = *pb++; 00435 00436 /* Fetch 1 state variable */ 00437 x0 = *px0++; 00438 x1 = *px1++; 00439 00440 /* Perform the multiply-accumulate */ 00441 acc0 += x0 * c0; 00442 acc1 += x1 * c0; 00443 00444 /* Decrement the loop counter */ 00445 tapCnt--; 00446 } 00447 00448 /* Advance the state pointer by the decimation factor 00449 * to process the next group of decimation factor number samples */ 00450 pState = pState + S->M * 2; 00451 00452 /* Store filter output, smlad returns the values in 2.14 format */ 00453 /* so downsacle by 15 to get output in 1.15 */ 00454 00455 *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); 00456 *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16)); 00457 00458 00459 /* Decrement the loop counter */ 00460 blkCnt--; 00461 } 00462 00463 while(blkCntN3 > 0u) 00464 { 00465 /* Copy decimation factor number of new input samples into the state buffer */ 00466 i = S->M; 00467 00468 do 00469 { 00470 *pStateCurnt++ = *pSrc++; 00471 00472 } while(--i); 00473 00474 /*Set sum to zero */ 00475 sum0 = 0; 00476 00477 /* Initialize state pointer */ 00478 px = pState; 00479 00480 /* Initialize coeff pointer */ 00481 pb = pCoeffs; 00482 00483 /* Loop unrolling. Process 4 taps at a time. */ 00484 tapCnt = numTaps >> 2; 00485 00486 /* Loop over the number of taps. Unroll by a factor of 4. 00487 ** Repeat until we've computed numTaps-4 coefficients. */ 00488 while(tapCnt > 0u) 00489 { 00490 /* Read the Read b[numTaps-1] coefficients */ 00491 c0 = *pb++; 00492 00493 /* Read x[n-numTaps-1] and sample */ 00494 x0 = *px++; 00495 00496 /* Perform the multiply-accumulate */ 00497 sum0 += x0 * c0; 00498 00499 /* Read the b[numTaps-2] coefficient */ 00500 c0 = *pb++; 00501 00502 /* Read x[n-numTaps-2] and sample */ 00503 x0 = *px++; 00504 00505 /* Perform the multiply-accumulate */ 00506 sum0 += x0 * c0; 00507 00508 /* Read the b[numTaps-3] coefficients */ 00509 c0 = *pb++; 00510 00511 /* Read x[n-numTaps-3] sample */ 00512 x0 = *px++; 00513 00514 /* Perform the multiply-accumulate */ 00515 sum0 += x0 * c0; 00516 00517 /* Read the b[numTaps-4] coefficient */ 00518 c0 = *pb++; 00519 00520 /* Read x[n-numTaps-4] sample */ 00521 x0 = *px++; 00522 00523 /* Perform the multiply-accumulate */ 00524 sum0 += x0 * c0; 00525 00526 /* Decrement the loop counter */ 00527 tapCnt--; 00528 } 00529 00530 /* If the filter length is not a multiple of 4, compute the remaining filter taps */ 00531 tapCnt = numTaps % 0x4u; 00532 00533 while(tapCnt > 0u) 00534 { 00535 /* Read coefficients */ 00536 c0 = *pb++; 00537 00538 /* Fetch 1 state variable */ 00539 x0 = *px++; 00540 00541 /* Perform the multiply-accumulate */ 00542 sum0 += x0 * c0; 00543 00544 /* Decrement the loop counter */ 00545 tapCnt--; 00546 } 00547 00548 /* Advance the state pointer by the decimation factor 00549 * to process the next group of decimation factor number samples */ 00550 pState = pState + S->M; 00551 00552 /* Store filter output, smlad returns the values in 2.14 format */ 00553 /* so downsacle by 15 to get output in 1.15 */ 00554 *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16)); 00555 00556 /* Decrement the loop counter */ 00557 blkCntN3--; 00558 } 00559 00560 /* Processing is complete. 00561 ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. 00562 ** This prepares the state buffer for the next function call. */ 00563 00564 /* Points to the start of the state buffer */ 00565 pStateCurnt = S->pState; 00566 00567 i = (numTaps - 1u) >> 2u; 00568 00569 /* copy data */ 00570 while(i > 0u) 00571 { 00572 *pStateCurnt++ = *pState++; 00573 *pStateCurnt++ = *pState++; 00574 *pStateCurnt++ = *pState++; 00575 *pStateCurnt++ = *pState++; 00576 00577 /* Decrement the loop counter */ 00578 i--; 00579 } 00580 00581 i = (numTaps - 1u) % 0x04u; 00582 00583 /* copy data */ 00584 while(i > 0u) 00585 { 00586 *pStateCurnt++ = *pState++; 00587 00588 /* Decrement the loop counter */ 00589 i--; 00590 } 00591 } 00592 00593 00594 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00595 00596 /** 00597 * @} end of FIR_decimate group 00598 */
Generated on Tue Jul 12 2022 18:44:09 by
