CMSIS DSP library
Dependents: KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more
Fork of mbed-dsp by
arm_fir_sparse_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_fir_sparse_q15.c 00009 * 00010 * Description: Q15 sparse FIR filter processing function. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * ------------------------------------------------------------------- */ 00040 #include "arm_math.h" 00041 00042 /** 00043 * @addtogroup FIR_Sparse 00044 * @{ 00045 */ 00046 00047 /** 00048 * @brief Processing function for the Q15 sparse FIR filter. 00049 * @param[in] *S points to an instance of the Q15 sparse FIR structure. 00050 * @param[in] *pSrc points to the block of input data. 00051 * @param[out] *pDst points to the block of output data 00052 * @param[in] *pScratchIn points to a temporary buffer of size blockSize. 00053 * @param[in] *pScratchOut points to a temporary buffer of size blockSize. 00054 * @param[in] blockSize number of input samples to process per call. 00055 * @return none. 00056 * 00057 * <b>Scaling and Overflow Behavior:</b> 00058 * \par 00059 * The function is implemented using an internal 32-bit accumulator. 00060 * The 1.15 x 1.15 multiplications yield a 2.30 result and these are added to a 2.30 accumulator. 00061 * Thus the full precision of the multiplications is maintained but there is only a single guard bit in the accumulator. 00062 * If the accumulator result overflows it will wrap around rather than saturate. 00063 * After all multiply-accumulates are performed, the 2.30 accumulator is truncated to 2.15 format and then saturated to 1.15 format. 00064 * In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits. 00065 */ 00066 00067 00068 void arm_fir_sparse_q15( 00069 arm_fir_sparse_instance_q15 * S, 00070 q15_t * pSrc, 00071 q15_t * pDst, 00072 q15_t * pScratchIn, 00073 q31_t * pScratchOut, 00074 uint32_t blockSize) 00075 { 00076 00077 q15_t *pState = S->pState; /* State pointer */ 00078 q15_t *pIn = pSrc; /* Working pointer for input */ 00079 q15_t *pOut = pDst; /* Working pointer for output */ 00080 q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 00081 q15_t *px; /* Temporary pointers for scratch buffer */ 00082 q15_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */ 00083 q15_t *py = pState; /* Temporary pointers for state buffer */ 00084 int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */ 00085 uint32_t delaySize = S->maxDelay + blockSize; /* state length */ 00086 uint16_t numTaps = S->numTaps; /* Filter order */ 00087 int32_t readIndex; /* Read index of the state buffer */ 00088 uint32_t tapCnt, blkCnt; /* loop counters */ 00089 q15_t coeff = *pCoeffs++; /* Read the first coefficient value */ 00090 q31_t *pScr2 = pScratchOut; /* Working pointer for pScratchOut */ 00091 00092 00093 #ifndef ARM_MATH_CM0_FAMILY 00094 00095 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00096 00097 q31_t in1, in2; /* Temporary variables */ 00098 00099 00100 /* BlockSize of Input samples are copied into the state buffer */ 00101 /* StateIndex points to the starting position to write in the state buffer */ 00102 arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize); 00103 00104 /* Loop over the number of taps. */ 00105 tapCnt = numTaps; 00106 00107 /* Read Index, from where the state buffer should be read, is calculated. */ 00108 readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 00109 00110 /* Wraparound of readIndex */ 00111 if(readIndex < 0) 00112 { 00113 readIndex += (int32_t) delaySize; 00114 } 00115 00116 /* Working pointer for state buffer is updated */ 00117 py = pState; 00118 00119 /* blockSize samples are read from the state buffer */ 00120 arm_circularRead_q15(py, delaySize, &readIndex, 1, 00121 pb, pb, blockSize, 1, blockSize); 00122 00123 /* Working pointer for the scratch buffer of state values */ 00124 px = pb; 00125 00126 /* Working pointer for scratch buffer of output values */ 00127 pScratchOut = pScr2; 00128 00129 /* Loop over the blockSize. Unroll by a factor of 4. 00130 * Compute 4 multiplications at a time. */ 00131 blkCnt = blockSize >> 2; 00132 00133 while(blkCnt > 0u) 00134 { 00135 /* Perform multiplication and store in the scratch buffer */ 00136 *pScratchOut++ = ((q31_t) * px++ * coeff); 00137 *pScratchOut++ = ((q31_t) * px++ * coeff); 00138 *pScratchOut++ = ((q31_t) * px++ * coeff); 00139 *pScratchOut++ = ((q31_t) * px++ * coeff); 00140 00141 /* Decrement the loop counter */ 00142 blkCnt--; 00143 } 00144 00145 /* If the blockSize is not a multiple of 4, 00146 * compute the remaining samples */ 00147 blkCnt = blockSize % 0x4u; 00148 00149 while(blkCnt > 0u) 00150 { 00151 /* Perform multiplication and store in the scratch buffer */ 00152 *pScratchOut++ = ((q31_t) * px++ * coeff); 00153 00154 /* Decrement the loop counter */ 00155 blkCnt--; 00156 } 00157 00158 /* Load the coefficient value and 00159 * increment the coefficient buffer for the next set of state values */ 00160 coeff = *pCoeffs++; 00161 00162 /* Read Index, from where the state buffer should be read, is calculated. */ 00163 readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 00164 00165 /* Wraparound of readIndex */ 00166 if(readIndex < 0) 00167 { 00168 readIndex += (int32_t) delaySize; 00169 } 00170 00171 /* Loop over the number of taps. */ 00172 tapCnt = (uint32_t) numTaps - 1u; 00173 00174 while(tapCnt > 0u) 00175 { 00176 /* Working pointer for state buffer is updated */ 00177 py = pState; 00178 00179 /* blockSize samples are read from the state buffer */ 00180 arm_circularRead_q15(py, delaySize, &readIndex, 1, 00181 pb, pb, blockSize, 1, blockSize); 00182 00183 /* Working pointer for the scratch buffer of state values */ 00184 px = pb; 00185 00186 /* Working pointer for scratch buffer of output values */ 00187 pScratchOut = pScr2; 00188 00189 /* Loop over the blockSize. Unroll by a factor of 4. 00190 * Compute 4 MACS at a time. */ 00191 blkCnt = blockSize >> 2; 00192 00193 while(blkCnt > 0u) 00194 { 00195 /* Perform Multiply-Accumulate */ 00196 *pScratchOut++ += (q31_t) * px++ * coeff; 00197 *pScratchOut++ += (q31_t) * px++ * coeff; 00198 *pScratchOut++ += (q31_t) * px++ * coeff; 00199 *pScratchOut++ += (q31_t) * px++ * coeff; 00200 00201 /* Decrement the loop counter */ 00202 blkCnt--; 00203 } 00204 00205 /* If the blockSize is not a multiple of 4, 00206 * compute the remaining samples */ 00207 blkCnt = blockSize % 0x4u; 00208 00209 while(blkCnt > 0u) 00210 { 00211 /* Perform Multiply-Accumulate */ 00212 *pScratchOut++ += (q31_t) * px++ * coeff; 00213 00214 /* Decrement the loop counter */ 00215 blkCnt--; 00216 } 00217 00218 /* Load the coefficient value and 00219 * increment the coefficient buffer for the next set of state values */ 00220 coeff = *pCoeffs++; 00221 00222 /* Read Index, from where the state buffer should be read, is calculated. */ 00223 readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 00224 00225 /* Wraparound of readIndex */ 00226 if(readIndex < 0) 00227 { 00228 readIndex += (int32_t) delaySize; 00229 } 00230 00231 /* Decrement the tap loop counter */ 00232 tapCnt--; 00233 } 00234 00235 /* All the output values are in pScratchOut buffer. 00236 Convert them into 1.15 format, saturate and store in the destination buffer. */ 00237 /* Loop over the blockSize. */ 00238 blkCnt = blockSize >> 2; 00239 00240 while(blkCnt > 0u) 00241 { 00242 in1 = *pScr2++; 00243 in2 = *pScr2++; 00244 00245 #ifndef ARM_MATH_BIG_ENDIAN 00246 00247 *__SIMD32(pOut)++ = 00248 __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), 00249 16); 00250 00251 #else 00252 *__SIMD32(pOut)++ = 00253 __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16), 00254 16); 00255 00256 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00257 00258 in1 = *pScr2++; 00259 00260 in2 = *pScr2++; 00261 00262 #ifndef ARM_MATH_BIG_ENDIAN 00263 00264 *__SIMD32(pOut)++ = 00265 __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), 00266 16); 00267 00268 #else 00269 00270 *__SIMD32(pOut)++ = 00271 __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16), 00272 16); 00273 00274 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00275 00276 00277 blkCnt--; 00278 00279 } 00280 00281 /* If the blockSize is not a multiple of 4, 00282 remaining samples are processed in the below loop */ 00283 blkCnt = blockSize % 0x4u; 00284 00285 while(blkCnt > 0u) 00286 { 00287 *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16); 00288 blkCnt--; 00289 } 00290 00291 #else 00292 00293 /* Run the below code for Cortex-M0 */ 00294 00295 /* BlockSize of Input samples are copied into the state buffer */ 00296 /* StateIndex points to the starting position to write in the state buffer */ 00297 arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize); 00298 00299 /* Loop over the number of taps. */ 00300 tapCnt = numTaps; 00301 00302 /* Read Index, from where the state buffer should be read, is calculated. */ 00303 readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 00304 00305 /* Wraparound of readIndex */ 00306 if(readIndex < 0) 00307 { 00308 readIndex += (int32_t) delaySize; 00309 } 00310 00311 /* Working pointer for state buffer is updated */ 00312 py = pState; 00313 00314 /* blockSize samples are read from the state buffer */ 00315 arm_circularRead_q15(py, delaySize, &readIndex, 1, 00316 pb, pb, blockSize, 1, blockSize); 00317 00318 /* Working pointer for the scratch buffer of state values */ 00319 px = pb; 00320 00321 /* Working pointer for scratch buffer of output values */ 00322 pScratchOut = pScr2; 00323 00324 blkCnt = blockSize; 00325 00326 while(blkCnt > 0u) 00327 { 00328 /* Perform multiplication and store in the scratch buffer */ 00329 *pScratchOut++ = ((q31_t) * px++ * coeff); 00330 00331 /* Decrement the loop counter */ 00332 blkCnt--; 00333 } 00334 00335 /* Load the coefficient value and 00336 * increment the coefficient buffer for the next set of state values */ 00337 coeff = *pCoeffs++; 00338 00339 /* Read Index, from where the state buffer should be read, is calculated. */ 00340 readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 00341 00342 /* Wraparound of readIndex */ 00343 if(readIndex < 0) 00344 { 00345 readIndex += (int32_t) delaySize; 00346 } 00347 00348 /* Loop over the number of taps. */ 00349 tapCnt = (uint32_t) numTaps - 1u; 00350 00351 while(tapCnt > 0u) 00352 { 00353 /* Working pointer for state buffer is updated */ 00354 py = pState; 00355 00356 /* blockSize samples are read from the state buffer */ 00357 arm_circularRead_q15(py, delaySize, &readIndex, 1, 00358 pb, pb, blockSize, 1, blockSize); 00359 00360 /* Working pointer for the scratch buffer of state values */ 00361 px = pb; 00362 00363 /* Working pointer for scratch buffer of output values */ 00364 pScratchOut = pScr2; 00365 00366 blkCnt = blockSize; 00367 00368 while(blkCnt > 0u) 00369 { 00370 /* Perform Multiply-Accumulate */ 00371 *pScratchOut++ += (q31_t) * px++ * coeff; 00372 00373 /* Decrement the loop counter */ 00374 blkCnt--; 00375 } 00376 00377 /* Load the coefficient value and 00378 * increment the coefficient buffer for the next set of state values */ 00379 coeff = *pCoeffs++; 00380 00381 /* Read Index, from where the state buffer should be read, is calculated. */ 00382 readIndex = (S->stateIndex - blockSize) - *pTapDelay++; 00383 00384 /* Wraparound of readIndex */ 00385 if(readIndex < 0) 00386 { 00387 readIndex += (int32_t) delaySize; 00388 } 00389 00390 /* Decrement the tap loop counter */ 00391 tapCnt--; 00392 } 00393 00394 /* All the output values are in pScratchOut buffer. 00395 Convert them into 1.15 format, saturate and store in the destination buffer. */ 00396 /* Loop over the blockSize. */ 00397 blkCnt = blockSize; 00398 00399 while(blkCnt > 0u) 00400 { 00401 *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16); 00402 blkCnt--; 00403 } 00404 00405 #endif /* #ifndef ARM_MATH_CM0_FAMILY */ 00406 00407 } 00408 00409 /** 00410 * @} end of FIR_Sparse group 00411 */
Generated on Tue Jul 12 2022 12:36:55 by 1.7.2