CMSIS DSP library
Dependents: performance_timer Surfboard_ gps2rtty Capstone ... more
arm_fir_sparse_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_fir_sparse_q15.c 00009 * 00010 * Description: Q15 sparse FIR filter processing function. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * ------------------------------------------------------------------- */ 00040 #include "arm_math.h" 00041 00042 /** 00043 * @addtogroup FIR_Sparse 00044 * @{ 00045 */ 00046 00047 /** 00048 * @brief Processing function for the Q15 sparse FIR filter. 00049 * @param[in] *S points to an instance of the Q15 sparse FIR structure. 00050 * @param[in] *pSrc points to the block of input data. 00051 * @param[out] *pDst points to the block of output data 00052 * @param[in] *pScratchIn points to a temporary buffer of size blockSize. 00053 * @param[in] *pScratchOut points to a temporary buffer of size blockSize. 00054 * @param[in] blockSize number of input samples to process per call. 00055 * @return none. 00056 * 00057 * <b>Scaling and Overflow Behavior:</b> 00058 * \par 00059 * The function is implemented using an internal 32-bit accumulator. 00060 * The 1.15 x 1.15 multiplications yield a 2.30 result and these are added to a 2.30 accumulator. 00061 * Thus the full precision of the multiplications is maintained but there is only a single guard bit in the accumulator. 00062 * If the accumulator result overflows it will wrap around rather than saturate. 00063 * After all multiply-accumulates are performed, the 2.30 accumulator is truncated to 2.15 format and then saturated to 1.15 format. 00064 * In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits. 00065 */ 00066 00067 00068 void arm_fir_sparse_q15( 00069 arm_fir_sparse_instance_q15 * S, 00070 q15_t * pSrc, 00071 q15_t * pDst, 00072 q15_t * pScratchIn, 00073 q31_t * pScratchOut, 00074 uint32_t blockSize) 00075 { 00076 00077 q15_t *pState = S->pState; /* State pointer */ 00078 q15_t *pIn = pSrc; /* Working pointer for input */ 00079 q15_t *pOut = pDst; /* Working pointer for output */ 00080 q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 00081 q15_t *px; /* Temporary pointers for scratch buffer */ 00082 q15_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */ 00083 q15_t *py = pState; /* Temporary pointers for state buffer */ 00084 int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */ 00085 uint32_t delaySize = S->maxDelay + blockSize; /* state length */ 00086 uint16_t numTaps = S->numTaps; /* Filter order */ 00087 int32_t readIndex; /* Read index of the state buffer */ 00088 uint32_t tapCnt, blkCnt; /* loop counters */ 00089 q15_t coeff = *pCoeffs++; /* Read the first coefficient value */ 00090 q31_t *pScr2 = pScratchOut; /* Working pointer for pScratchOut */ 00091 00092 00093 #ifndef ARM_MATH_CM0_FAMILY 00094 00095 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00096 00097 q31_t in1, in2; /* Temporary variables */ 00098 00099 00100 /* BlockSize of Input samples are copied into the state buffer */ 00101 /* StateIndex points to the starting position to write in the state buffer */ 00102 arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize); 00103 00104 /* Loop over the number of taps. */ 00105 tapCnt = numTaps; 00106 00107 /* Read Index, from where the state buffer should be read, is calculated. */ 00108 readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 00109 00110 /* Wraparound of readIndex */ 00111 if(readIndex < 0) 00112 { 00113 readIndex += (int32_t) delaySize; 00114 } 00115 00116 /* Working pointer for state buffer is updated */ 00117 py = pState; 00118 00119 /* blockSize samples are read from the state buffer */ 00120 arm_circularRead_q15(py, delaySize, &readIndex, 1, 00121 pb, pb, blockSize, 1, blockSize); 00122 00123 /* Working pointer for the scratch buffer of state values */ 00124 px = pb; 00125 00126 /* Working pointer for scratch buffer of output values */ 00127 pScratchOut = pScr2; 00128 00129 /* Loop over the blockSize. Unroll by a factor of 4. 00130 * Compute 4 multiplications at a time. */ 00131 blkCnt = blockSize >> 2; 00132 00133 while(blkCnt > 0u) 00134 { 00135 /* Perform multiplication and store in the scratch buffer */ 00136 *pScratchOut++ = ((q31_t) * px++ * coeff); 00137 *pScratchOut++ = ((q31_t) * px++ * coeff); 00138 *pScratchOut++ = ((q31_t) * px++ * coeff); 00139 *pScratchOut++ = ((q31_t) * px++ * coeff); 00140 00141 /* Decrement the loop counter */ 00142 blkCnt--; 00143 } 00144 00145 /* If the blockSize is not a multiple of 4, 00146 * compute the remaining samples */ 00147 blkCnt = blockSize % 0x4u; 00148 00149 while(blkCnt > 0u) 00150 { 00151 /* Perform multiplication and store in the scratch buffer */ 00152 *pScratchOut++ = ((q31_t) * px++ * coeff); 00153 00154 /* Decrement the loop counter */ 00155 blkCnt--; 00156 } 00157 00158 /* Load the coefficient value and 00159 * increment the coefficient buffer for the next set of state values */ 00160 coeff = *pCoeffs++; 00161 00162 /* Read Index, from where the state buffer should be read, is calculated. */ 00163 readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 00164 00165 /* Wraparound of readIndex */ 00166 if(readIndex < 0) 00167 { 00168 readIndex += (int32_t) delaySize; 00169 } 00170 00171 /* Loop over the number of taps. */ 00172 tapCnt = (uint32_t) numTaps - 2u; 00173 00174 while(tapCnt > 0u) 00175 { 00176 /* Working pointer for state buffer is updated */ 00177 py = pState; 00178 00179 /* blockSize samples are read from the state buffer */ 00180 arm_circularRead_q15(py, delaySize, &readIndex, 1, 00181 pb, pb, blockSize, 1, blockSize); 00182 00183 /* Working pointer for the scratch buffer of state values */ 00184 px = pb; 00185 00186 /* Working pointer for scratch buffer of output values */ 00187 pScratchOut = pScr2; 00188 00189 /* Loop over the blockSize. Unroll by a factor of 4. 00190 * Compute 4 MACS at a time. */ 00191 blkCnt = blockSize >> 2; 00192 00193 while(blkCnt > 0u) 00194 { 00195 /* Perform Multiply-Accumulate */ 00196 *pScratchOut++ += (q31_t) * px++ * coeff; 00197 *pScratchOut++ += (q31_t) * px++ * coeff; 00198 *pScratchOut++ += (q31_t) * px++ * coeff; 00199 *pScratchOut++ += (q31_t) * px++ * coeff; 00200 00201 /* Decrement the loop counter */ 00202 blkCnt--; 00203 } 00204 00205 /* If the blockSize is not a multiple of 4, 00206 * compute the remaining samples */ 00207 blkCnt = blockSize % 0x4u; 00208 00209 while(blkCnt > 0u) 00210 { 00211 /* Perform Multiply-Accumulate */ 00212 *pScratchOut++ += (q31_t) * px++ * coeff; 00213 00214 /* Decrement the loop counter */ 00215 blkCnt--; 00216 } 00217 00218 /* Load the coefficient value and 00219 * increment the coefficient buffer for the next set of state values */ 00220 coeff = *pCoeffs++; 00221 00222 /* Read Index, from where the state buffer should be read, is calculated. */ 00223 readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 00224 00225 /* Wraparound of readIndex */ 00226 if(readIndex < 0) 00227 { 00228 readIndex += (int32_t) delaySize; 00229 } 00230 00231 /* Decrement the tap loop counter */ 00232 tapCnt--; 00233 } 00234 00235 /* Compute last tap without the final read of pTapDelay */ 00236 00237 /* Working pointer for state buffer is updated */ 00238 py = pState; 00239 00240 /* blockSize samples are read from the state buffer */ 00241 arm_circularRead_q15(py, delaySize, &readIndex, 1, 00242 pb, pb, blockSize, 1, blockSize); 00243 00244 /* Working pointer for the scratch buffer of state values */ 00245 px = pb; 00246 00247 /* Working pointer for scratch buffer of output values */ 00248 pScratchOut = pScr2; 00249 00250 /* Loop over the blockSize. Unroll by a factor of 4. 00251 * Compute 4 MACS at a time. */ 00252 blkCnt = blockSize >> 2; 00253 00254 while(blkCnt > 0u) 00255 { 00256 /* Perform Multiply-Accumulate */ 00257 *pScratchOut++ += (q31_t) * px++ * coeff; 00258 *pScratchOut++ += (q31_t) * px++ * coeff; 00259 *pScratchOut++ += (q31_t) * px++ * coeff; 00260 *pScratchOut++ += (q31_t) * px++ * coeff; 00261 00262 /* Decrement the loop counter */ 00263 blkCnt--; 00264 } 00265 00266 /* If the blockSize is not a multiple of 4, 00267 * compute the remaining samples */ 00268 blkCnt = blockSize % 0x4u; 00269 00270 while(blkCnt > 0u) 00271 { 00272 /* Perform Multiply-Accumulate */ 00273 *pScratchOut++ += (q31_t) * px++ * coeff; 00274 00275 /* Decrement the loop counter */ 00276 blkCnt--; 00277 } 00278 00279 /* All the output values are in pScratchOut buffer. 00280 Convert them into 1.15 format, saturate and store in the destination buffer. */ 00281 /* Loop over the blockSize. */ 00282 blkCnt = blockSize >> 2; 00283 00284 while(blkCnt > 0u) 00285 { 00286 in1 = *pScr2++; 00287 in2 = *pScr2++; 00288 00289 #ifndef ARM_MATH_BIG_ENDIAN 00290 00291 *__SIMD32(pOut)++ = 00292 __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), 00293 16); 00294 00295 #else 00296 *__SIMD32(pOut)++ = 00297 __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16), 00298 16); 00299 00300 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00301 00302 in1 = *pScr2++; 00303 00304 in2 = *pScr2++; 00305 00306 #ifndef ARM_MATH_BIG_ENDIAN 00307 00308 *__SIMD32(pOut)++ = 00309 __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), 00310 16); 00311 00312 #else 00313 00314 *__SIMD32(pOut)++ = 00315 __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16), 00316 16); 00317 00318 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00319 00320 00321 blkCnt--; 00322 00323 } 00324 00325 /* If the blockSize is not a multiple of 4, 00326 remaining samples are processed in the below loop */ 00327 blkCnt = blockSize % 0x4u; 00328 00329 while(blkCnt > 0u) 00330 { 00331 *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16); 00332 blkCnt--; 00333 } 00334 00335 #else 00336 00337 /* Run the below code for Cortex-M0 */ 00338 00339 /* BlockSize of Input samples are copied into the state buffer */ 00340 /* StateIndex points to the starting position to write in the state buffer */ 00341 arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize); 00342 00343 /* Loop over the number of taps. */ 00344 tapCnt = numTaps; 00345 00346 /* Read Index, from where the state buffer should be read, is calculated. */ 00347 readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 00348 00349 /* Wraparound of readIndex */ 00350 if(readIndex < 0) 00351 { 00352 readIndex += (int32_t) delaySize; 00353 } 00354 00355 /* Working pointer for state buffer is updated */ 00356 py = pState; 00357 00358 /* blockSize samples are read from the state buffer */ 00359 arm_circularRead_q15(py, delaySize, &readIndex, 1, 00360 pb, pb, blockSize, 1, blockSize); 00361 00362 /* Working pointer for the scratch buffer of state values */ 00363 px = pb; 00364 00365 /* Working pointer for scratch buffer of output values */ 00366 pScratchOut = pScr2; 00367 00368 blkCnt = blockSize; 00369 00370 while(blkCnt > 0u) 00371 { 00372 /* Perform multiplication and store in the scratch buffer */ 00373 *pScratchOut++ = ((q31_t) * px++ * coeff); 00374 00375 /* Decrement the loop counter */ 00376 blkCnt--; 00377 } 00378 00379 /* Load the coefficient value and 00380 * increment the coefficient buffer for the next set of state values */ 00381 coeff = *pCoeffs++; 00382 00383 /* Read Index, from where the state buffer should be read, is calculated. */ 00384 readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 00385 00386 /* Wraparound of readIndex */ 00387 if(readIndex < 0) 00388 { 00389 readIndex += (int32_t) delaySize; 00390 } 00391 00392 /* Loop over the number of taps. */ 00393 tapCnt = (uint32_t) numTaps - 2u; 00394 00395 while(tapCnt > 0u) 00396 { 00397 /* Working pointer for state buffer is updated */ 00398 py = pState; 00399 00400 /* blockSize samples are read from the state buffer */ 00401 arm_circularRead_q15(py, delaySize, &readIndex, 1, 00402 pb, pb, blockSize, 1, blockSize); 00403 00404 /* Working pointer for the scratch buffer of state values */ 00405 px = pb; 00406 00407 /* Working pointer for scratch buffer of output values */ 00408 pScratchOut = pScr2; 00409 00410 blkCnt = blockSize; 00411 00412 while(blkCnt > 0u) 00413 { 00414 /* Perform Multiply-Accumulate */ 00415 *pScratchOut++ += (q31_t) * px++ * coeff; 00416 00417 /* Decrement the loop counter */ 00418 blkCnt--; 00419 } 00420 00421 /* Load the coefficient value and 00422 * increment the coefficient buffer for the next set of state values */ 00423 coeff = *pCoeffs++; 00424 00425 /* Read Index, from where the state buffer should be read, is calculated. */ 00426 readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 00427 00428 /* Wraparound of readIndex */ 00429 if(readIndex < 0) 00430 { 00431 readIndex += (int32_t) delaySize; 00432 } 00433 00434 /* Decrement the tap loop counter */ 00435 tapCnt--; 00436 } 00437 00438 /* Compute last tap without the final read of pTapDelay */ 00439 00440 /* Working pointer for state buffer is updated */ 00441 py = pState; 00442 00443 /* blockSize samples are read from the state buffer */ 00444 arm_circularRead_q15(py, delaySize, &readIndex, 1, 00445 pb, pb, blockSize, 1, blockSize); 00446 00447 /* Working pointer for the scratch buffer of state values */ 00448 px = pb; 00449 00450 /* Working pointer for scratch buffer of output values */ 00451 pScratchOut = pScr2; 00452 00453 blkCnt = blockSize; 00454 00455 while(blkCnt > 0u) 00456 { 00457 /* Perform Multiply-Accumulate */ 00458 *pScratchOut++ += (q31_t) * px++ * coeff; 00459 00460 /* Decrement the loop counter */ 00461 blkCnt--; 00462 } 00463 00464 /* All the output values are in pScratchOut buffer. 00465 Convert them into 1.15 format, saturate and store in the destination buffer. */ 00466 /* Loop over the blockSize. */ 00467 blkCnt = blockSize; 00468 00469 while(blkCnt > 0u) 00470 { 00471 *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16); 00472 blkCnt--; 00473 } 00474 00475 #endif /* #ifndef ARM_MATH_CM0_FAMILY */ 00476 00477 } 00478 00479 /** 00480 * @} end of FIR_Sparse group 00481 */
Generated on Tue Jul 12 2022 11:59:17 by 1.7.2