Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software download the repository Zip archive or clone locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
arm_fir_decimate_fast_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Project: CMSIS DSP Library 00003 * Title: arm_fir_decimate_fast_q15.c 00004 * Description: Fast Q15 FIR Decimator 00005 * 00006 * $Date: 27. January 2017 00007 * $Revision: V.1.5.1 00008 * 00009 * Target Processor: Cortex-M cores 00010 * -------------------------------------------------------------------- */ 00011 /* 00012 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. 00013 * 00014 * SPDX-License-Identifier: Apache-2.0 00015 * 00016 * Licensed under the Apache License, Version 2.0 (the License); you may 00017 * not use this file except in compliance with the License. 00018 * You may obtain a copy of the License at 00019 * 00020 * www.apache.org/licenses/LICENSE-2.0 00021 * 00022 * Unless required by applicable law or agreed to in writing, software 00023 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 00024 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00025 * See the License for the specific language governing permissions and 00026 * limitations under the License. 00027 */ 00028 00029 #include "arm_math.h" 00030 00031 /** 00032 * @ingroup groupFilters 00033 */ 00034 00035 /** 00036 * @addtogroup FIR_decimate 00037 * @{ 00038 */ 00039 00040 /** 00041 * @brief Processing function for the Q15 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4. 00042 * @param[in] *S points to an instance of the Q15 FIR decimator structure. 00043 * @param[in] *pSrc points to the block of input data. 00044 * @param[out] *pDst points to the block of output data 00045 * @param[in] blockSize number of input samples to process per call. 00046 * @return none 00047 * 00048 * \par Restrictions 00049 * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE 00050 * In this case input, output, state buffers should be aligned by 32-bit 00051 * 00052 * <b>Scaling and Overflow Behavior:</b> 00053 * \par 00054 * This fast version uses a 32-bit accumulator with 2.30 format. 00055 * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit. 00056 * Thus, if the accumulator result overflows it wraps around and distorts the result. 00057 * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (log2 is read as log to the base 2). 00058 * The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result. 00059 * 00060 * \par 00061 * Refer to the function <code>arm_fir_decimate_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. 00062 * Both the slow and the fast versions use the same instance structure. 00063 * Use the function <code>arm_fir_decimate_init_q15()</code> to initialize the filter structure. 00064 */ 00065 00066 #ifndef UNALIGNED_SUPPORT_DISABLE 00067 00068 void arm_fir_decimate_fast_q15( 00069 const arm_fir_decimate_instance_q15 * S, 00070 q15_t * pSrc, 00071 q15_t * pDst, 00072 uint32_t blockSize) 00073 { 00074 q15_t *pState = S->pState; /* State pointer */ 00075 q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 00076 q15_t *pStateCurnt; /* Points to the current sample of the state */ 00077 q15_t *px; /* Temporary pointer for state buffer */ 00078 q15_t *pb; /* Temporary pointer coefficient buffer */ 00079 q31_t x0, x1, c0, c1; /* Temporary variables to hold state and coefficient values */ 00080 q31_t sum0; /* Accumulators */ 00081 q31_t acc0, acc1; 00082 q15_t *px0, *px1; 00083 uint32_t blkCntN3; 00084 uint32_t numTaps = S->numTaps; /* Number of taps */ 00085 uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */ 00086 00087 00088 /* S->pState buffer contains previous frame (numTaps - 1) samples */ 00089 /* pStateCurnt points to the location where the new input data should be written */ 00090 pStateCurnt = S->pState + (numTaps - 1U); 00091 00092 00093 /* Total number of output samples to be computed */ 00094 blkCnt = outBlockSize / 2; 00095 blkCntN3 = outBlockSize - (2 * blkCnt); 00096 00097 00098 while (blkCnt > 0U) 00099 { 00100 /* Copy decimation factor number of new input samples into the state buffer */ 00101 i = 2 * S->M; 00102 00103 do 00104 { 00105 *pStateCurnt++ = *pSrc++; 00106 00107 } while (--i); 00108 00109 /* Set accumulator to zero */ 00110 acc0 = 0; 00111 acc1 = 0; 00112 00113 /* Initialize state pointer */ 00114 px0 = pState; 00115 00116 px1 = pState + S->M; 00117 00118 00119 /* Initialize coeff pointer */ 00120 pb = pCoeffs; 00121 00122 /* Loop unrolling. Process 4 taps at a time. */ 00123 tapCnt = numTaps >> 2; 00124 00125 /* Loop over the number of taps. Unroll by a factor of 4. 00126 ** Repeat until we've computed numTaps-4 coefficients. */ 00127 while (tapCnt > 0U) 00128 { 00129 /* Read the Read b[numTaps-1] and b[numTaps-2] coefficients */ 00130 c0 = *__SIMD32(pb)++; 00131 00132 /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */ 00133 x0 = *__SIMD32(px0)++; 00134 00135 x1 = *__SIMD32(px1)++; 00136 00137 /* Perform the multiply-accumulate */ 00138 acc0 = __SMLAD(x0, c0, acc0); 00139 00140 acc1 = __SMLAD(x1, c0, acc1); 00141 00142 /* Read the b[numTaps-3] and b[numTaps-4] coefficient */ 00143 c0 = *__SIMD32(pb)++; 00144 00145 /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */ 00146 x0 = *__SIMD32(px0)++; 00147 00148 x1 = *__SIMD32(px1)++; 00149 00150 /* Perform the multiply-accumulate */ 00151 acc0 = __SMLAD(x0, c0, acc0); 00152 00153 acc1 = __SMLAD(x1, c0, acc1); 00154 00155 /* Decrement the loop counter */ 00156 tapCnt--; 00157 } 00158 00159 /* If the filter length is not a multiple of 4, compute the remaining filter taps */ 00160 tapCnt = numTaps % 0x4U; 00161 00162 while (tapCnt > 0U) 00163 { 00164 /* Read coefficients */ 00165 c0 = *pb++; 00166 00167 /* Fetch 1 state variable */ 00168 x0 = *px0++; 00169 00170 x1 = *px1++; 00171 00172 /* Perform the multiply-accumulate */ 00173 acc0 = __SMLAD(x0, c0, acc0); 00174 acc1 = __SMLAD(x1, c0, acc1); 00175 00176 /* Decrement the loop counter */ 00177 tapCnt--; 00178 } 00179 00180 /* Advance the state pointer by the decimation factor 00181 * to process the next group of decimation factor number samples */ 00182 pState = pState + S->M * 2; 00183 00184 /* Store filter output, smlad returns the values in 2.14 format */ 00185 /* so downsacle by 15 to get output in 1.15 */ 00186 *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); 00187 *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16)); 00188 00189 /* Decrement the loop counter */ 00190 blkCnt--; 00191 } 00192 00193 00194 00195 while (blkCntN3 > 0U) 00196 { 00197 /* Copy decimation factor number of new input samples into the state buffer */ 00198 i = S->M; 00199 00200 do 00201 { 00202 *pStateCurnt++ = *pSrc++; 00203 00204 } while (--i); 00205 00206 /*Set sum to zero */ 00207 sum0 = 0; 00208 00209 /* Initialize state pointer */ 00210 px = pState; 00211 00212 /* Initialize coeff pointer */ 00213 pb = pCoeffs; 00214 00215 /* Loop unrolling. Process 4 taps at a time. */ 00216 tapCnt = numTaps >> 2; 00217 00218 /* Loop over the number of taps. Unroll by a factor of 4. 00219 ** Repeat until we've computed numTaps-4 coefficients. */ 00220 while (tapCnt > 0U) 00221 { 00222 /* Read the Read b[numTaps-1] and b[numTaps-2] coefficients */ 00223 c0 = *__SIMD32(pb)++; 00224 00225 /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */ 00226 x0 = *__SIMD32(px)++; 00227 00228 /* Read the b[numTaps-3] and b[numTaps-4] coefficient */ 00229 c1 = *__SIMD32(pb)++; 00230 00231 /* Perform the multiply-accumulate */ 00232 sum0 = __SMLAD(x0, c0, sum0); 00233 00234 /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */ 00235 x0 = *__SIMD32(px)++; 00236 00237 /* Perform the multiply-accumulate */ 00238 sum0 = __SMLAD(x0, c1, sum0); 00239 00240 /* Decrement the loop counter */ 00241 tapCnt--; 00242 } 00243 00244 /* If the filter length is not a multiple of 4, compute the remaining filter taps */ 00245 tapCnt = numTaps % 0x4U; 00246 00247 while (tapCnt > 0U) 00248 { 00249 /* Read coefficients */ 00250 c0 = *pb++; 00251 00252 /* Fetch 1 state variable */ 00253 x0 = *px++; 00254 00255 /* Perform the multiply-accumulate */ 00256 sum0 = __SMLAD(x0, c0, sum0); 00257 00258 /* Decrement the loop counter */ 00259 tapCnt--; 00260 } 00261 00262 /* Advance the state pointer by the decimation factor 00263 * to process the next group of decimation factor number samples */ 00264 pState = pState + S->M; 00265 00266 /* Store filter output, smlad returns the values in 2.14 format */ 00267 /* so downsacle by 15 to get output in 1.15 */ 00268 *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16)); 00269 00270 /* Decrement the loop counter */ 00271 blkCntN3--; 00272 } 00273 00274 /* Processing is complete. 00275 ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. 00276 ** This prepares the state buffer for the next function call. */ 00277 00278 /* Points to the start of the state buffer */ 00279 pStateCurnt = S->pState; 00280 00281 i = (numTaps - 1U) >> 2U; 00282 00283 /* copy data */ 00284 while (i > 0U) 00285 { 00286 *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; 00287 *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; 00288 00289 /* Decrement the loop counter */ 00290 i--; 00291 } 00292 00293 i = (numTaps - 1U) % 0x04U; 00294 00295 /* copy data */ 00296 while (i > 0U) 00297 { 00298 *pStateCurnt++ = *pState++; 00299 00300 /* Decrement the loop counter */ 00301 i--; 00302 } 00303 } 00304 00305 #else 00306 00307 00308 void arm_fir_decimate_fast_q15( 00309 const arm_fir_decimate_instance_q15 * S, 00310 q15_t * pSrc, 00311 q15_t * pDst, 00312 uint32_t blockSize) 00313 { 00314 q15_t *pState = S->pState; /* State pointer */ 00315 q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 00316 q15_t *pStateCurnt; /* Points to the current sample of the state */ 00317 q15_t *px; /* Temporary pointer for state buffer */ 00318 q15_t *pb; /* Temporary pointer coefficient buffer */ 00319 q15_t x0, x1, c0; /* Temporary variables to hold state and coefficient values */ 00320 q31_t sum0; /* Accumulators */ 00321 q31_t acc0, acc1; 00322 q15_t *px0, *px1; 00323 uint32_t blkCntN3; 00324 uint32_t numTaps = S->numTaps; /* Number of taps */ 00325 uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */ 00326 00327 00328 /* S->pState buffer contains previous frame (numTaps - 1) samples */ 00329 /* pStateCurnt points to the location where the new input data should be written */ 00330 pStateCurnt = S->pState + (numTaps - 1U); 00331 00332 00333 /* Total number of output samples to be computed */ 00334 blkCnt = outBlockSize / 2; 00335 blkCntN3 = outBlockSize - (2 * blkCnt); 00336 00337 while (blkCnt > 0U) 00338 { 00339 /* Copy decimation factor number of new input samples into the state buffer */ 00340 i = 2 * S->M; 00341 00342 do 00343 { 00344 *pStateCurnt++ = *pSrc++; 00345 00346 } while (--i); 00347 00348 /* Set accumulator to zero */ 00349 acc0 = 0; 00350 acc1 = 0; 00351 00352 /* Initialize state pointer */ 00353 px0 = pState; 00354 00355 px1 = pState + S->M; 00356 00357 00358 /* Initialize coeff pointer */ 00359 pb = pCoeffs; 00360 00361 /* Loop unrolling. Process 4 taps at a time. */ 00362 tapCnt = numTaps >> 2; 00363 00364 /* Loop over the number of taps. Unroll by a factor of 4. 00365 ** Repeat until we've computed numTaps-4 coefficients. */ 00366 while (tapCnt > 0U) 00367 { 00368 /* Read the Read b[numTaps-1] coefficients */ 00369 c0 = *pb++; 00370 00371 /* Read x[n-numTaps-1] for sample 0 and for sample 1 */ 00372 x0 = *px0++; 00373 x1 = *px1++; 00374 00375 /* Perform the multiply-accumulate */ 00376 acc0 += x0 * c0; 00377 acc1 += x1 * c0; 00378 00379 /* Read the b[numTaps-2] coefficient */ 00380 c0 = *pb++; 00381 00382 /* Read x[n-numTaps-2] for sample 0 and sample 1 */ 00383 x0 = *px0++; 00384 x1 = *px1++; 00385 00386 /* Perform the multiply-accumulate */ 00387 acc0 += x0 * c0; 00388 acc1 += x1 * c0; 00389 00390 /* Read the b[numTaps-3] coefficients */ 00391 c0 = *pb++; 00392 00393 /* Read x[n-numTaps-3] for sample 0 and sample 1 */ 00394 x0 = *px0++; 00395 x1 = *px1++; 00396 00397 /* Perform the multiply-accumulate */ 00398 acc0 += x0 * c0; 00399 acc1 += x1 * c0; 00400 00401 /* Read the b[numTaps-4] coefficient */ 00402 c0 = *pb++; 00403 00404 /* Read x[n-numTaps-4] for sample 0 and sample 1 */ 00405 x0 = *px0++; 00406 x1 = *px1++; 00407 00408 /* Perform the multiply-accumulate */ 00409 acc0 += x0 * c0; 00410 acc1 += x1 * c0; 00411 00412 /* Decrement the loop counter */ 00413 tapCnt--; 00414 } 00415 00416 /* If the filter length is not a multiple of 4, compute the remaining filter taps */ 00417 tapCnt = numTaps % 0x4U; 00418 00419 while (tapCnt > 0U) 00420 { 00421 /* Read coefficients */ 00422 c0 = *pb++; 00423 00424 /* Fetch 1 state variable */ 00425 x0 = *px0++; 00426 x1 = *px1++; 00427 00428 /* Perform the multiply-accumulate */ 00429 acc0 += x0 * c0; 00430 acc1 += x1 * c0; 00431 00432 /* Decrement the loop counter */ 00433 tapCnt--; 00434 } 00435 00436 /* Advance the state pointer by the decimation factor 00437 * to process the next group of decimation factor number samples */ 00438 pState = pState + S->M * 2; 00439 00440 /* Store filter output, smlad returns the values in 2.14 format */ 00441 /* so downsacle by 15 to get output in 1.15 */ 00442 00443 *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); 00444 *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16)); 00445 00446 00447 /* Decrement the loop counter */ 00448 blkCnt--; 00449 } 00450 00451 while (blkCntN3 > 0U) 00452 { 00453 /* Copy decimation factor number of new input samples into the state buffer */ 00454 i = S->M; 00455 00456 do 00457 { 00458 *pStateCurnt++ = *pSrc++; 00459 00460 } while (--i); 00461 00462 /*Set sum to zero */ 00463 sum0 = 0; 00464 00465 /* Initialize state pointer */ 00466 px = pState; 00467 00468 /* Initialize coeff pointer */ 00469 pb = pCoeffs; 00470 00471 /* Loop unrolling. Process 4 taps at a time. */ 00472 tapCnt = numTaps >> 2; 00473 00474 /* Loop over the number of taps. Unroll by a factor of 4. 00475 ** Repeat until we've computed numTaps-4 coefficients. */ 00476 while (tapCnt > 0U) 00477 { 00478 /* Read the Read b[numTaps-1] coefficients */ 00479 c0 = *pb++; 00480 00481 /* Read x[n-numTaps-1] and sample */ 00482 x0 = *px++; 00483 00484 /* Perform the multiply-accumulate */ 00485 sum0 += x0 * c0; 00486 00487 /* Read the b[numTaps-2] coefficient */ 00488 c0 = *pb++; 00489 00490 /* Read x[n-numTaps-2] and sample */ 00491 x0 = *px++; 00492 00493 /* Perform the multiply-accumulate */ 00494 sum0 += x0 * c0; 00495 00496 /* Read the b[numTaps-3] coefficients */ 00497 c0 = *pb++; 00498 00499 /* Read x[n-numTaps-3] sample */ 00500 x0 = *px++; 00501 00502 /* Perform the multiply-accumulate */ 00503 sum0 += x0 * c0; 00504 00505 /* Read the b[numTaps-4] coefficient */ 00506 c0 = *pb++; 00507 00508 /* Read x[n-numTaps-4] sample */ 00509 x0 = *px++; 00510 00511 /* Perform the multiply-accumulate */ 00512 sum0 += x0 * c0; 00513 00514 /* Decrement the loop counter */ 00515 tapCnt--; 00516 } 00517 00518 /* If the filter length is not a multiple of 4, compute the remaining filter taps */ 00519 tapCnt = numTaps % 0x4U; 00520 00521 while (tapCnt > 0U) 00522 { 00523 /* Read coefficients */ 00524 c0 = *pb++; 00525 00526 /* Fetch 1 state variable */ 00527 x0 = *px++; 00528 00529 /* Perform the multiply-accumulate */ 00530 sum0 += x0 * c0; 00531 00532 /* Decrement the loop counter */ 00533 tapCnt--; 00534 } 00535 00536 /* Advance the state pointer by the decimation factor 00537 * to process the next group of decimation factor number samples */ 00538 pState = pState + S->M; 00539 00540 /* Store filter output, smlad returns the values in 2.14 format */ 00541 /* so downsacle by 15 to get output in 1.15 */ 00542 *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16)); 00543 00544 /* Decrement the loop counter */ 00545 blkCntN3--; 00546 } 00547 00548 /* Processing is complete. 00549 ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. 00550 ** This prepares the state buffer for the next function call. */ 00551 00552 /* Points to the start of the state buffer */ 00553 pStateCurnt = S->pState; 00554 00555 i = (numTaps - 1U) >> 2U; 00556 00557 /* copy data */ 00558 while (i > 0U) 00559 { 00560 *pStateCurnt++ = *pState++; 00561 *pStateCurnt++ = *pState++; 00562 *pStateCurnt++ = *pState++; 00563 *pStateCurnt++ = *pState++; 00564 00565 /* Decrement the loop counter */ 00566 i--; 00567 } 00568 00569 i = (numTaps - 1U) % 0x04U; 00570 00571 /* copy data */ 00572 while (i > 0U) 00573 { 00574 *pStateCurnt++ = *pState++; 00575 00576 /* Decrement the loop counter */ 00577 i--; 00578 } 00579 } 00580 00581 00582 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00583 00584 /** 00585 * @} end of FIR_decimate group 00586 */ 00587
Generated on Tue Jul 12 2022 16:47:27 by
 1.7.2
 1.7.2