Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software download the repository Zip archive or clone locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of OmniWheels by
arm_fir_sparse_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_fir_sparse_q15.c 00009 * 00010 * Description: Q15 sparse FIR filter processing function. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * ------------------------------------------------------------------- */ 00040 #include "arm_math.h" 00041 00042 /** 00043 * @addtogroup FIR_Sparse 00044 * @{ 00045 */ 00046 00047 /** 00048 * @brief Processing function for the Q15 sparse FIR filter. 00049 * @param[in] *S points to an instance of the Q15 sparse FIR structure. 00050 * @param[in] *pSrc points to the block of input data. 00051 * @param[out] *pDst points to the block of output data 00052 * @param[in] *pScratchIn points to a temporary buffer of size blockSize. 00053 * @param[in] *pScratchOut points to a temporary buffer of size blockSize. 00054 * @param[in] blockSize number of input samples to process per call. 00055 * @return none. 00056 * 00057 * <b>Scaling and Overflow Behavior:</b> 00058 * \par 00059 * The function is implemented using an internal 32-bit accumulator. 00060 * The 1.15 x 1.15 multiplications yield a 2.30 result and these are added to a 2.30 accumulator. 00061 * Thus the full precision of the multiplications is maintained but there is only a single guard bit in the accumulator. 00062 * If the accumulator result overflows it will wrap around rather than saturate. 00063 * After all multiply-accumulates are performed, the 2.30 accumulator is truncated to 2.15 format and then saturated to 1.15 format. 00064 * In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits. 00065 */ 00066 00067 00068 void arm_fir_sparse_q15( 00069 arm_fir_sparse_instance_q15 * S, 00070 q15_t * pSrc, 00071 q15_t * pDst, 00072 q15_t * pScratchIn, 00073 q31_t * pScratchOut, 00074 uint32_t blockSize) 00075 { 00076 00077 q15_t *pState = S->pState; /* State pointer */ 00078 q15_t *pIn = pSrc; /* Working pointer for input */ 00079 q15_t *pOut = pDst; /* Working pointer for output */ 00080 q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 00081 q15_t *px; /* Temporary pointers for scratch buffer */ 00082 q15_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */ 00083 q15_t *py = pState; /* Temporary pointers for state buffer */ 00084 int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */ 00085 uint32_t delaySize = S->maxDelay + blockSize; /* state length */ 00086 uint16_t numTaps = S->numTaps; /* Filter order */ 00087 int32_t readIndex; /* Read index of the state buffer */ 00088 uint32_t tapCnt, blkCnt; /* loop counters */ 00089 q15_t coeff = *pCoeffs++; /* Read the first coefficient value */ 00090 q31_t *pScr2 = pScratchOut; /* Working pointer for pScratchOut */ 00091 00092 00093 #ifndef ARM_MATH_CM0_FAMILY 00094 00095 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00096 00097 q31_t in1, in2; /* Temporary variables */ 00098 00099 00100 /* BlockSize of Input samples are copied into the state buffer */ 00101 /* StateIndex points to the starting position to write in the state buffer */ 00102 arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize); 00103 00104 /* Loop over the number of taps. */ 00105 tapCnt = numTaps; 00106 00107 /* Read Index, from where the state buffer should be read, is calculated. */ 00108 readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 00109 00110 /* Wraparound of readIndex */ 00111 if(readIndex < 0) 00112 { 00113 readIndex += (int32_t) delaySize; 00114 } 00115 00116 /* Working pointer for state buffer is updated */ 00117 py = pState; 00118 00119 /* blockSize samples are read from the state buffer */ 00120 arm_circularRead_q15(py, delaySize, &readIndex, 1, 00121 pb, pb, blockSize, 1, blockSize); 00122 00123 /* Working pointer for the scratch buffer of state values */ 00124 px = pb; 00125 00126 /* Working pointer for scratch buffer of output values */ 00127 pScratchOut = pScr2; 00128 00129 /* Loop over the blockSize. Unroll by a factor of 4. 00130 * Compute 4 multiplications at a time. */ 00131 blkCnt = blockSize >> 2; 00132 00133 while(blkCnt > 0u) 00134 { 00135 /* Perform multiplication and store in the scratch buffer */ 00136 *pScratchOut++ = ((q31_t) * px++ * coeff); 00137 *pScratchOut++ = ((q31_t) * px++ * coeff); 00138 *pScratchOut++ = ((q31_t) * px++ * coeff); 00139 *pScratchOut++ = ((q31_t) * px++ * coeff); 00140 00141 /* Decrement the loop counter */ 00142 blkCnt--; 00143 } 00144 00145 /* If the blockSize is not a multiple of 4, 00146 * compute the remaining samples */ 00147 blkCnt = blockSize % 0x4u; 00148 00149 while(blkCnt > 0u) 00150 { 00151 /* Perform multiplication and store in the scratch buffer */ 00152 *pScratchOut++ = ((q31_t) * px++ * coeff); 00153 00154 /* Decrement the loop counter */ 00155 blkCnt--; 00156 } 00157 00158 /* Load the coefficient value and 00159 * increment the coefficient buffer for the next set of state values */ 00160 coeff = *pCoeffs++; 00161 00162 /* Read Index, from where the state buffer should be read, is calculated. */ 00163 readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 00164 00165 /* Wraparound of readIndex */ 00166 if(readIndex < 0) 00167 { 00168 readIndex += (int32_t) delaySize; 00169 } 00170 00171 /* Loop over the number of taps. */ 00172 tapCnt = (uint32_t) numTaps - 2u; 00173 00174 while(tapCnt > 0u) 00175 { 00176 /* Working pointer for state buffer is updated */ 00177 py = pState; 00178 00179 /* blockSize samples are read from the state buffer */ 00180 arm_circularRead_q15(py, delaySize, &readIndex, 1, 00181 pb, pb, blockSize, 1, blockSize); 00182 00183 /* Working pointer for the scratch buffer of state values */ 00184 px = pb; 00185 00186 /* Working pointer for scratch buffer of output values */ 00187 pScratchOut = pScr2; 00188 00189 /* Loop over the blockSize. Unroll by a factor of 4. 00190 * Compute 4 MACS at a time. */ 00191 blkCnt = blockSize >> 2; 00192 00193 while(blkCnt > 0u) 00194 { 00195 /* Perform Multiply-Accumulate */ 00196 *pScratchOut++ += (q31_t) * px++ * coeff; 00197 *pScratchOut++ += (q31_t) * px++ * coeff; 00198 *pScratchOut++ += (q31_t) * px++ * coeff; 00199 *pScratchOut++ += (q31_t) * px++ * coeff; 00200 00201 /* Decrement the loop counter */ 00202 blkCnt--; 00203 } 00204 00205 /* If the blockSize is not a multiple of 4, 00206 * compute the remaining samples */ 00207 blkCnt = blockSize % 0x4u; 00208 00209 while(blkCnt > 0u) 00210 { 00211 /* Perform Multiply-Accumulate */ 00212 *pScratchOut++ += (q31_t) * px++ * coeff; 00213 00214 /* Decrement the loop counter */ 00215 blkCnt--; 00216 } 00217 00218 /* Load the coefficient value and 00219 * increment the coefficient buffer for the next set of state values */ 00220 coeff = *pCoeffs++; 00221 00222 /* Read Index, from where the state buffer should be read, is calculated. */ 00223 readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 00224 00225 /* Wraparound of readIndex */ 00226 if(readIndex < 0) 00227 { 00228 readIndex += (int32_t) delaySize; 00229 } 00230 00231 /* Decrement the tap loop counter */ 00232 tapCnt--; 00233 } 00234 00235 /* Compute last tap without the final read of pTapDelay */ 00236 00237 /* Working pointer for state buffer is updated */ 00238 py = pState; 00239 00240 /* blockSize samples are read from the state buffer */ 00241 arm_circularRead_q15(py, delaySize, &readIndex, 1, 00242 pb, pb, blockSize, 1, blockSize); 00243 00244 /* Working pointer for the scratch buffer of state values */ 00245 px = pb; 00246 00247 /* Working pointer for scratch buffer of output values */ 00248 pScratchOut = pScr2; 00249 00250 /* Loop over the blockSize. Unroll by a factor of 4. 00251 * Compute 4 MACS at a time. */ 00252 blkCnt = blockSize >> 2; 00253 00254 while(blkCnt > 0u) 00255 { 00256 /* Perform Multiply-Accumulate */ 00257 *pScratchOut++ += (q31_t) * px++ * coeff; 00258 *pScratchOut++ += (q31_t) * px++ * coeff; 00259 *pScratchOut++ += (q31_t) * px++ * coeff; 00260 *pScratchOut++ += (q31_t) * px++ * coeff; 00261 00262 /* Decrement the loop counter */ 00263 blkCnt--; 00264 } 00265 00266 /* If the blockSize is not a multiple of 4, 00267 * compute the remaining samples */ 00268 blkCnt = blockSize % 0x4u; 00269 00270 while(blkCnt > 0u) 00271 { 00272 /* Perform Multiply-Accumulate */ 00273 *pScratchOut++ += (q31_t) * px++ * coeff; 00274 00275 /* Decrement the loop counter */ 00276 blkCnt--; 00277 } 00278 00279 /* All the output values are in pScratchOut buffer. 00280 Convert them into 1.15 format, saturate and store in the destination buffer. */ 00281 /* Loop over the blockSize. */ 00282 blkCnt = blockSize >> 2; 00283 00284 while(blkCnt > 0u) 00285 { 00286 in1 = *pScr2++; 00287 in2 = *pScr2++; 00288 00289 #ifndef ARM_MATH_BIG_ENDIAN 00290 00291 *__SIMD32(pOut)++ = 00292 __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), 00293 16); 00294 00295 #else 00296 *__SIMD32(pOut)++ = 00297 __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16), 00298 16); 00299 00300 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00301 00302 in1 = *pScr2++; 00303 00304 in2 = *pScr2++; 00305 00306 #ifndef ARM_MATH_BIG_ENDIAN 00307 00308 *__SIMD32(pOut)++ = 00309 __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), 00310 16); 00311 00312 #else 00313 00314 *__SIMD32(pOut)++ = 00315 __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16), 00316 16); 00317 00318 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00319 00320 00321 blkCnt--; 00322 00323 } 00324 00325 /* If the blockSize is not a multiple of 4, 00326 remaining samples are processed in the below loop */ 00327 blkCnt = blockSize % 0x4u; 00328 00329 while(blkCnt > 0u) 00330 { 00331 *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16); 00332 blkCnt--; 00333 } 00334 00335 #else 00336 00337 /* Run the below code for Cortex-M0 */ 00338 00339 /* BlockSize of Input samples are copied into the state buffer */ 00340 /* StateIndex points to the starting position to write in the state buffer */ 00341 arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize); 00342 00343 /* Loop over the number of taps. */ 00344 tapCnt = numTaps; 00345 00346 /* Read Index, from where the state buffer should be read, is calculated. */ 00347 readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 00348 00349 /* Wraparound of readIndex */ 00350 if(readIndex < 0) 00351 { 00352 readIndex += (int32_t) delaySize; 00353 } 00354 00355 /* Working pointer for state buffer is updated */ 00356 py = pState; 00357 00358 /* blockSize samples are read from the state buffer */ 00359 arm_circularRead_q15(py, delaySize, &readIndex, 1, 00360 pb, pb, blockSize, 1, blockSize); 00361 00362 /* Working pointer for the scratch buffer of state values */ 00363 px = pb; 00364 00365 /* Working pointer for scratch buffer of output values */ 00366 pScratchOut = pScr2; 00367 00368 blkCnt = blockSize; 00369 00370 while(blkCnt > 0u) 00371 { 00372 /* Perform multiplication and store in the scratch buffer */ 00373 *pScratchOut++ = ((q31_t) * px++ * coeff); 00374 00375 /* Decrement the loop counter */ 00376 blkCnt--; 00377 } 00378 00379 /* Load the coefficient value and 00380 * increment the coefficient buffer for the next set of state values */ 00381 coeff = *pCoeffs++; 00382 00383 /* Read Index, from where the state buffer should be read, is calculated. */ 00384 readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 00385 00386 /* Wraparound of readIndex */ 00387 if(readIndex < 0) 00388 { 00389 readIndex += (int32_t) delaySize; 00390 } 00391 00392 /* Loop over the number of taps. */ 00393 tapCnt = (uint32_t) numTaps - 2u; 00394 00395 while(tapCnt > 0u) 00396 { 00397 /* Working pointer for state buffer is updated */ 00398 py = pState; 00399 00400 /* blockSize samples are read from the state buffer */ 00401 arm_circularRead_q15(py, delaySize, &readIndex, 1, 00402 pb, pb, blockSize, 1, blockSize); 00403 00404 /* Working pointer for the scratch buffer of state values */ 00405 px = pb; 00406 00407 /* Working pointer for scratch buffer of output values */ 00408 pScratchOut = pScr2; 00409 00410 blkCnt = blockSize; 00411 00412 while(blkCnt > 0u) 00413 { 00414 /* Perform Multiply-Accumulate */ 00415 *pScratchOut++ += (q31_t) * px++ * coeff; 00416 00417 /* Decrement the loop counter */ 00418 blkCnt--; 00419 } 00420 00421 /* Load the coefficient value and 00422 * increment the coefficient buffer for the next set of state values */ 00423 coeff = *pCoeffs++; 00424 00425 /* Read Index, from where the state buffer should be read, is calculated. */ 00426 readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 00427 00428 /* Wraparound of readIndex */ 00429 if(readIndex < 0) 00430 { 00431 readIndex += (int32_t) delaySize; 00432 } 00433 00434 /* Decrement the tap loop counter */ 00435 tapCnt--; 00436 } 00437 00438 /* Compute last tap without the final read of pTapDelay */ 00439 00440 /* Working pointer for state buffer is updated */ 00441 py = pState; 00442 00443 /* blockSize samples are read from the state buffer */ 00444 arm_circularRead_q15(py, delaySize, &readIndex, 1, 00445 pb, pb, blockSize, 1, blockSize); 00446 00447 /* Working pointer for the scratch buffer of state values */ 00448 px = pb; 00449 00450 /* Working pointer for scratch buffer of output values */ 00451 pScratchOut = pScr2; 00452 00453 blkCnt = blockSize; 00454 00455 while(blkCnt > 0u) 00456 { 00457 /* Perform Multiply-Accumulate */ 00458 *pScratchOut++ += (q31_t) * px++ * coeff; 00459 00460 /* Decrement the loop counter */ 00461 blkCnt--; 00462 } 00463 00464 /* All the output values are in pScratchOut buffer. 00465 Convert them into 1.15 format, saturate and store in the destination buffer. */ 00466 /* Loop over the blockSize. */ 00467 blkCnt = blockSize; 00468 00469 while(blkCnt > 0u) 00470 { 00471 *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16); 00472 blkCnt--; 00473 } 00474 00475 #endif /* #ifndef ARM_MATH_CM0_FAMILY */ 00476 00477 } 00478 00479 /** 00480 * @} end of FIR_Sparse group 00481 */
Generated on Fri Jul 22 2022 04:53:44 by
 1.7.2 
    