Added CMSIS5 DSP and NN folder. Needs some work
Embed:
(wiki syntax)
Show/hide line numbers
arm_fir_decimate_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Project: CMSIS DSP Library 00003 * Title: arm_fir_decimate_q15.c 00004 * Description: Q15 FIR Decimator 00005 * 00006 * $Date: 27. January 2017 00007 * $Revision: V.1.5.1 00008 * 00009 * Target Processor: Cortex-M cores 00010 * -------------------------------------------------------------------- */ 00011 /* 00012 * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. 00013 * 00014 * SPDX-License-Identifier: Apache-2.0 00015 * 00016 * Licensed under the Apache License, Version 2.0 (the License); you may 00017 * not use this file except in compliance with the License. 00018 * You may obtain a copy of the License at 00019 * 00020 * www.apache.org/licenses/LICENSE-2.0 00021 * 00022 * Unless required by applicable law or agreed to in writing, software 00023 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 00024 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00025 * See the License for the specific language governing permissions and 00026 * limitations under the License. 00027 */ 00028 00029 #include "arm_math.h" 00030 00031 /** 00032 * @ingroup groupFilters 00033 */ 00034 00035 /** 00036 * @addtogroup FIR_decimate 00037 * @{ 00038 */ 00039 00040 /** 00041 * @brief Processing function for the Q15 FIR decimator. 00042 * @param[in] *S points to an instance of the Q15 FIR decimator structure. 00043 * @param[in] *pSrc points to the block of input data. 00044 * @param[out] *pDst points to the location where the output result is written. 00045 * @param[in] blockSize number of input samples to process per call. 00046 * @return none. 00047 * 00048 * <b>Scaling and Overflow Behavior:</b> 00049 * \par 00050 * The function is implemented using a 64-bit internal accumulator. 00051 * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result. 
00052 * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. 00053 * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. 00054 * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits. 00055 * Lastly, the accumulator is saturated to yield a result in 1.15 format. 00056 * 00057 * \par 00058 * Refer to the function <code>arm_fir_decimate_fast_q15()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4. 00059 */ 00060 00061 #if defined (ARM_MATH_DSP) 00062 00063 #ifndef UNALIGNED_SUPPORT_DISABLE 00064 00065 void arm_fir_decimate_q15( 00066 const arm_fir_decimate_instance_q15 * S, 00067 q15_t * pSrc, 00068 q15_t * pDst, 00069 uint32_t blockSize) 00070 { 00071 q15_t *pState = S->pState; /* State pointer */ 00072 q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 00073 q15_t *pStateCurnt; /* Points to the current sample of the state */ 00074 q15_t *px; /* Temporary pointer for state buffer */ 00075 q15_t *pb; /* Temporary pointer coefficient buffer */ 00076 q31_t x0, x1, c0, c1; /* Temporary variables to hold state and coefficient values */ 00077 q63_t sum0; /* Accumulators */ 00078 q63_t acc0, acc1; 00079 q15_t *px0, *px1; 00080 uint32_t blkCntN3; 00081 uint32_t numTaps = S->numTaps; /* Number of taps */ 00082 uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */ 00083 00084 00085 /* S->pState buffer contains previous frame (numTaps - 1) samples */ 00086 /* pStateCurnt points to the location where the new input data should be written */ 00087 pStateCurnt = S->pState + (numTaps - 1U); 00088 00089 00090 /* Total number of output samples to be computed */ 00091 blkCnt = outBlockSize / 2; 00092 blkCntN3 = outBlockSize - (2 * blkCnt); 00093 00094 00095 while (blkCnt > 0U) 00096 { 00097 /* Copy decimation factor number of new input samples 
into the state buffer */ 00098 i = 2 * S->M; 00099 00100 do 00101 { 00102 *pStateCurnt++ = *pSrc++; 00103 00104 } while (--i); 00105 00106 /* Set accumulator to zero */ 00107 acc0 = 0; 00108 acc1 = 0; 00109 00110 /* Initialize state pointer */ 00111 px0 = pState; 00112 00113 px1 = pState + S->M; 00114 00115 00116 /* Initialize coeff pointer */ 00117 pb = pCoeffs; 00118 00119 /* Loop unrolling. Process 4 taps at a time. */ 00120 tapCnt = numTaps >> 2; 00121 00122 /* Loop over the number of taps. Unroll by a factor of 4. 00123 ** Repeat until we've computed numTaps-4 coefficients. */ 00124 while (tapCnt > 0U) 00125 { 00126 /* Read the Read b[numTaps-1] and b[numTaps-2] coefficients */ 00127 c0 = *__SIMD32(pb)++; 00128 00129 /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */ 00130 x0 = *__SIMD32(px0)++; 00131 00132 x1 = *__SIMD32(px1)++; 00133 00134 /* Perform the multiply-accumulate */ 00135 acc0 = __SMLALD(x0, c0, acc0); 00136 00137 acc1 = __SMLALD(x1, c0, acc1); 00138 00139 /* Read the b[numTaps-3] and b[numTaps-4] coefficient */ 00140 c0 = *__SIMD32(pb)++; 00141 00142 /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */ 00143 x0 = *__SIMD32(px0)++; 00144 00145 x1 = *__SIMD32(px1)++; 00146 00147 /* Perform the multiply-accumulate */ 00148 acc0 = __SMLALD(x0, c0, acc0); 00149 00150 acc1 = __SMLALD(x1, c0, acc1); 00151 00152 /* Decrement the loop counter */ 00153 tapCnt--; 00154 } 00155 00156 /* If the filter length is not a multiple of 4, compute the remaining filter taps */ 00157 tapCnt = numTaps % 0x4U; 00158 00159 while (tapCnt > 0U) 00160 { 00161 /* Read coefficients */ 00162 c0 = *pb++; 00163 00164 /* Fetch 1 state variable */ 00165 x0 = *px0++; 00166 00167 x1 = *px1++; 00168 00169 /* Perform the multiply-accumulate */ 00170 acc0 = __SMLALD(x0, c0, acc0); 00171 acc1 = __SMLALD(x1, c0, acc1); 00172 00173 /* Decrement the loop counter */ 00174 tapCnt--; 00175 } 00176 00177 /* Advance the state pointer by the decimation factor 00178 * to process the next group of 
decimation factor number samples */ 00179 pState = pState + S->M * 2; 00180 00181 /* Store filter output, smlad returns the values in 2.14 format */ 00182 /* so downsacle by 15 to get output in 1.15 */ 00183 *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); 00184 *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16)); 00185 00186 /* Decrement the loop counter */ 00187 blkCnt--; 00188 } 00189 00190 00191 00192 while (blkCntN3 > 0U) 00193 { 00194 /* Copy decimation factor number of new input samples into the state buffer */ 00195 i = S->M; 00196 00197 do 00198 { 00199 *pStateCurnt++ = *pSrc++; 00200 00201 } while (--i); 00202 00203 /*Set sum to zero */ 00204 sum0 = 0; 00205 00206 /* Initialize state pointer */ 00207 px = pState; 00208 00209 /* Initialize coeff pointer */ 00210 pb = pCoeffs; 00211 00212 /* Loop unrolling. Process 4 taps at a time. */ 00213 tapCnt = numTaps >> 2; 00214 00215 /* Loop over the number of taps. Unroll by a factor of 4. 00216 ** Repeat until we've computed numTaps-4 coefficients. 
*/ 00217 while (tapCnt > 0U) 00218 { 00219 /* Read the Read b[numTaps-1] and b[numTaps-2] coefficients */ 00220 c0 = *__SIMD32(pb)++; 00221 00222 /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */ 00223 x0 = *__SIMD32(px)++; 00224 00225 /* Read the b[numTaps-3] and b[numTaps-4] coefficient */ 00226 c1 = *__SIMD32(pb)++; 00227 00228 /* Perform the multiply-accumulate */ 00229 sum0 = __SMLALD(x0, c0, sum0); 00230 00231 /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */ 00232 x0 = *__SIMD32(px)++; 00233 00234 /* Perform the multiply-accumulate */ 00235 sum0 = __SMLALD(x0, c1, sum0); 00236 00237 /* Decrement the loop counter */ 00238 tapCnt--; 00239 } 00240 00241 /* If the filter length is not a multiple of 4, compute the remaining filter taps */ 00242 tapCnt = numTaps % 0x4U; 00243 00244 while (tapCnt > 0U) 00245 { 00246 /* Read coefficients */ 00247 c0 = *pb++; 00248 00249 /* Fetch 1 state variable */ 00250 x0 = *px++; 00251 00252 /* Perform the multiply-accumulate */ 00253 sum0 = __SMLALD(x0, c0, sum0); 00254 00255 /* Decrement the loop counter */ 00256 tapCnt--; 00257 } 00258 00259 /* Advance the state pointer by the decimation factor 00260 * to process the next group of decimation factor number samples */ 00261 pState = pState + S->M; 00262 00263 /* Store filter output, smlad returns the values in 2.14 format */ 00264 /* so downsacle by 15 to get output in 1.15 */ 00265 *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16)); 00266 00267 /* Decrement the loop counter */ 00268 blkCntN3--; 00269 } 00270 00271 /* Processing is complete. 00272 ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. 00273 ** This prepares the state buffer for the next function call. 
*/ 00274 00275 /* Points to the start of the state buffer */ 00276 pStateCurnt = S->pState; 00277 00278 i = (numTaps - 1U) >> 2U; 00279 00280 /* copy data */ 00281 while (i > 0U) 00282 { 00283 *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; 00284 *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; 00285 00286 /* Decrement the loop counter */ 00287 i--; 00288 } 00289 00290 i = (numTaps - 1U) % 0x04U; 00291 00292 /* copy data */ 00293 while (i > 0U) 00294 { 00295 *pStateCurnt++ = *pState++; 00296 00297 /* Decrement the loop counter */ 00298 i--; 00299 } 00300 } 00301 00302 #else 00303 00304 00305 void arm_fir_decimate_q15( 00306 const arm_fir_decimate_instance_q15 * S, 00307 q15_t * pSrc, 00308 q15_t * pDst, 00309 uint32_t blockSize) 00310 { 00311 q15_t *pState = S->pState; /* State pointer */ 00312 q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 00313 q15_t *pStateCurnt; /* Points to the current sample of the state */ 00314 q15_t *px; /* Temporary pointer for state buffer */ 00315 q15_t *pb; /* Temporary pointer coefficient buffer */ 00316 q15_t x0, x1, c0; /* Temporary variables to hold state and coefficient values */ 00317 q63_t sum0; /* Accumulators */ 00318 q63_t acc0, acc1; 00319 q15_t *px0, *px1; 00320 uint32_t blkCntN3; 00321 uint32_t numTaps = S->numTaps; /* Number of taps */ 00322 uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */ 00323 00324 00325 /* S->pState buffer contains previous frame (numTaps - 1) samples */ 00326 /* pStateCurnt points to the location where the new input data should be written */ 00327 pStateCurnt = S->pState + (numTaps - 1U); 00328 00329 00330 /* Total number of output samples to be computed */ 00331 blkCnt = outBlockSize / 2; 00332 blkCntN3 = outBlockSize - (2 * blkCnt); 00333 00334 while (blkCnt > 0U) 00335 { 00336 /* Copy decimation factor number of new input samples into the state buffer */ 00337 i = 2 * S->M; 00338 00339 do 00340 { 00341 *pStateCurnt++ = *pSrc++; 00342 00343 } while (--i); 
00344 00345 /* Set accumulator to zero */ 00346 acc0 = 0; 00347 acc1 = 0; 00348 00349 /* Initialize state pointer */ 00350 px0 = pState; 00351 00352 px1 = pState + S->M; 00353 00354 00355 /* Initialize coeff pointer */ 00356 pb = pCoeffs; 00357 00358 /* Loop unrolling. Process 4 taps at a time. */ 00359 tapCnt = numTaps >> 2; 00360 00361 /* Loop over the number of taps. Unroll by a factor of 4. 00362 ** Repeat until we've computed numTaps-4 coefficients. */ 00363 while (tapCnt > 0U) 00364 { 00365 /* Read the Read b[numTaps-1] coefficients */ 00366 c0 = *pb++; 00367 00368 /* Read x[n-numTaps-1] for sample 0 and for sample 1 */ 00369 x0 = *px0++; 00370 x1 = *px1++; 00371 00372 /* Perform the multiply-accumulate */ 00373 acc0 += x0 * c0; 00374 acc1 += x1 * c0; 00375 00376 /* Read the b[numTaps-2] coefficient */ 00377 c0 = *pb++; 00378 00379 /* Read x[n-numTaps-2] for sample 0 and sample 1 */ 00380 x0 = *px0++; 00381 x1 = *px1++; 00382 00383 /* Perform the multiply-accumulate */ 00384 acc0 += x0 * c0; 00385 acc1 += x1 * c0; 00386 00387 /* Read the b[numTaps-3] coefficients */ 00388 c0 = *pb++; 00389 00390 /* Read x[n-numTaps-3] for sample 0 and sample 1 */ 00391 x0 = *px0++; 00392 x1 = *px1++; 00393 00394 /* Perform the multiply-accumulate */ 00395 acc0 += x0 * c0; 00396 acc1 += x1 * c0; 00397 00398 /* Read the b[numTaps-4] coefficient */ 00399 c0 = *pb++; 00400 00401 /* Read x[n-numTaps-4] for sample 0 and sample 1 */ 00402 x0 = *px0++; 00403 x1 = *px1++; 00404 00405 /* Perform the multiply-accumulate */ 00406 acc0 += x0 * c0; 00407 acc1 += x1 * c0; 00408 00409 /* Decrement the loop counter */ 00410 tapCnt--; 00411 } 00412 00413 /* If the filter length is not a multiple of 4, compute the remaining filter taps */ 00414 tapCnt = numTaps % 0x4U; 00415 00416 while (tapCnt > 0U) 00417 { 00418 /* Read coefficients */ 00419 c0 = *pb++; 00420 00421 /* Fetch 1 state variable */ 00422 x0 = *px0++; 00423 x1 = *px1++; 00424 00425 /* Perform the multiply-accumulate */ 00426 acc0 
+= x0 * c0; 00427 acc1 += x1 * c0; 00428 00429 /* Decrement the loop counter */ 00430 tapCnt--; 00431 } 00432 00433 /* Advance the state pointer by the decimation factor 00434 * to process the next group of decimation factor number samples */ 00435 pState = pState + S->M * 2; 00436 00437 /* Store filter output, smlad returns the values in 2.14 format */ 00438 /* so downsacle by 15 to get output in 1.15 */ 00439 00440 *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); 00441 *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16)); 00442 00443 /* Decrement the loop counter */ 00444 blkCnt--; 00445 } 00446 00447 while (blkCntN3 > 0U) 00448 { 00449 /* Copy decimation factor number of new input samples into the state buffer */ 00450 i = S->M; 00451 00452 do 00453 { 00454 *pStateCurnt++ = *pSrc++; 00455 00456 } while (--i); 00457 00458 /*Set sum to zero */ 00459 sum0 = 0; 00460 00461 /* Initialize state pointer */ 00462 px = pState; 00463 00464 /* Initialize coeff pointer */ 00465 pb = pCoeffs; 00466 00467 /* Loop unrolling. Process 4 taps at a time. */ 00468 tapCnt = numTaps >> 2; 00469 00470 /* Loop over the number of taps. Unroll by a factor of 4. 00471 ** Repeat until we've computed numTaps-4 coefficients. 
*/ 00472 while (tapCnt > 0U) 00473 { 00474 /* Read the Read b[numTaps-1] coefficients */ 00475 c0 = *pb++; 00476 00477 /* Read x[n-numTaps-1] and sample */ 00478 x0 = *px++; 00479 00480 /* Perform the multiply-accumulate */ 00481 sum0 += x0 * c0; 00482 00483 /* Read the b[numTaps-2] coefficient */ 00484 c0 = *pb++; 00485 00486 /* Read x[n-numTaps-2] and sample */ 00487 x0 = *px++; 00488 00489 /* Perform the multiply-accumulate */ 00490 sum0 += x0 * c0; 00491 00492 /* Read the b[numTaps-3] coefficients */ 00493 c0 = *pb++; 00494 00495 /* Read x[n-numTaps-3] sample */ 00496 x0 = *px++; 00497 00498 /* Perform the multiply-accumulate */ 00499 sum0 += x0 * c0; 00500 00501 /* Read the b[numTaps-4] coefficient */ 00502 c0 = *pb++; 00503 00504 /* Read x[n-numTaps-4] sample */ 00505 x0 = *px++; 00506 00507 /* Perform the multiply-accumulate */ 00508 sum0 += x0 * c0; 00509 00510 /* Decrement the loop counter */ 00511 tapCnt--; 00512 } 00513 00514 /* If the filter length is not a multiple of 4, compute the remaining filter taps */ 00515 tapCnt = numTaps % 0x4U; 00516 00517 while (tapCnt > 0U) 00518 { 00519 /* Read coefficients */ 00520 c0 = *pb++; 00521 00522 /* Fetch 1 state variable */ 00523 x0 = *px++; 00524 00525 /* Perform the multiply-accumulate */ 00526 sum0 += x0 * c0; 00527 00528 /* Decrement the loop counter */ 00529 tapCnt--; 00530 } 00531 00532 /* Advance the state pointer by the decimation factor 00533 * to process the next group of decimation factor number samples */ 00534 pState = pState + S->M; 00535 00536 /* Store filter output, smlad returns the values in 2.14 format */ 00537 /* so downsacle by 15 to get output in 1.15 */ 00538 *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16)); 00539 00540 /* Decrement the loop counter */ 00541 blkCntN3--; 00542 } 00543 00544 /* Processing is complete. 00545 ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. 00546 ** This prepares the state buffer for the next function call. 
*/ 00547 00548 /* Points to the start of the state buffer */ 00549 pStateCurnt = S->pState; 00550 00551 i = (numTaps - 1U) >> 2U; 00552 00553 /* copy data */ 00554 while (i > 0U) 00555 { 00556 *pStateCurnt++ = *pState++; 00557 *pStateCurnt++ = *pState++; 00558 *pStateCurnt++ = *pState++; 00559 *pStateCurnt++ = *pState++; 00560 00561 /* Decrement the loop counter */ 00562 i--; 00563 } 00564 00565 i = (numTaps - 1U) % 0x04U; 00566 00567 /* copy data */ 00568 while (i > 0U) 00569 { 00570 *pStateCurnt++ = *pState++; 00571 00572 /* Decrement the loop counter */ 00573 i--; 00574 } 00575 } 00576 00577 00578 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ 00579 00580 #else 00581 00582 00583 void arm_fir_decimate_q15( 00584 const arm_fir_decimate_instance_q15 * S, 00585 q15_t * pSrc, 00586 q15_t * pDst, 00587 uint32_t blockSize) 00588 { 00589 q15_t *pState = S->pState; /* State pointer */ 00590 q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 00591 q15_t *pStateCurnt; /* Points to the current sample of the state */ 00592 q15_t *px; /* Temporary pointer for state buffer */ 00593 q15_t *pb; /* Temporary pointer coefficient buffer */ 00594 q31_t x0, c0; /* Temporary variables to hold state and coefficient values */ 00595 q63_t sum0; /* Accumulators */ 00596 uint32_t numTaps = S->numTaps; /* Number of taps */ 00597 uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */ 00598 00599 00600 00601 /* Run the below code for Cortex-M0 */ 00602 00603 /* S->pState buffer contains previous frame (numTaps - 1) samples */ 00604 /* pStateCurnt points to the location where the new input data should be written */ 00605 pStateCurnt = S->pState + (numTaps - 1U); 00606 00607 /* Total number of output samples to be computed */ 00608 blkCnt = outBlockSize; 00609 00610 while (blkCnt > 0U) 00611 { 00612 /* Copy decimation factor number of new input samples into the state buffer */ 00613 i = S->M; 00614 00615 do 00616 { 00617 *pStateCurnt++ = *pSrc++; 00618 00619 } 
while (--i); 00620 00621 /*Set sum to zero */ 00622 sum0 = 0; 00623 00624 /* Initialize state pointer */ 00625 px = pState; 00626 00627 /* Initialize coeff pointer */ 00628 pb = pCoeffs; 00629 00630 tapCnt = numTaps; 00631 00632 while (tapCnt > 0U) 00633 { 00634 /* Read coefficients */ 00635 c0 = *pb++; 00636 00637 /* Fetch 1 state variable */ 00638 x0 = *px++; 00639 00640 /* Perform the multiply-accumulate */ 00641 sum0 += (q31_t) x0 *c0; 00642 00643 /* Decrement the loop counter */ 00644 tapCnt--; 00645 } 00646 00647 /* Advance the state pointer by the decimation factor 00648 * to process the next group of decimation factor number samples */ 00649 pState = pState + S->M; 00650 00651 /*Store filter output , smlad will return the values in 2.14 format */ 00652 /* so downsacle by 15 to get output in 1.15 */ 00653 *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16)); 00654 00655 /* Decrement the loop counter */ 00656 blkCnt--; 00657 } 00658 00659 /* Processing is complete. 00660 ** Now copy the last numTaps - 1 samples to the start of the state buffer. 00661 ** This prepares the state buffer for the next function call. */ 00662 00663 /* Points to the start of the state buffer */ 00664 pStateCurnt = S->pState; 00665 00666 i = numTaps - 1U; 00667 00668 /* copy data */ 00669 while (i > 0U) 00670 { 00671 *pStateCurnt++ = *pState++; 00672 00673 /* Decrement the loop counter */ 00674 i--; 00675 } 00676 00677 00678 } 00679 #endif /* #if defined (ARM_MATH_DSP) */ 00680 00681 00682 /** 00683 * @} end of FIR_decimate group 00684 */ 00685
Generated on Tue Jul 12 2022 16:47:27 by 1.7.2