CMSIS DSP library
Dependents: performance_timer Surfboard_ gps2rtty Capstone ... more
arm_fir_sparse_q31.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_fir_sparse_q31.c 00009 * 00010 * Description: Q31 sparse FIR filter processing function. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * ------------------------------------------------------------------- */ 00040 #include "arm_math.h" 00041 00042 00043 /** 00044 * @addtogroup FIR_Sparse 00045 * @{ 00046 */ 00047 00048 /** 00049 * @brief Processing function for the Q31 sparse FIR filter. 00050 * @param[in] *S points to an instance of the Q31 sparse FIR structure. 00051 * @param[in] *pSrc points to the block of input data. 00052 * @param[out] *pDst points to the block of output data 00053 * @param[in] *pScratchIn points to a temporary buffer of size blockSize. 00054 * @param[in] blockSize number of input samples to process per call. 00055 * @return none. 00056 * 00057 * <b>Scaling and Overflow Behavior:</b> 00058 * \par 00059 * The function is implemented using an internal 32-bit accumulator. 00060 * The 1.31 x 1.31 multiplications are truncated to 2.30 format. 00061 * This leads to loss of precision on the intermediate multiplications and provides only a single guard bit. 00062 * If the accumulator result overflows, it wraps around rather than saturate. 00063 * In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits. 
00064 */ 00065 00066 void arm_fir_sparse_q31( 00067 arm_fir_sparse_instance_q31 * S, 00068 q31_t * pSrc, 00069 q31_t * pDst, 00070 q31_t * pScratchIn, 00071 uint32_t blockSize) 00072 { 00073 00074 q31_t *pState = S->pState; /* State pointer */ 00075 q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 00076 q31_t *px; /* Scratch buffer pointer */ 00077 q31_t *py = pState; /* Temporary pointers for state buffer */ 00078 q31_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */ 00079 q31_t *pOut; /* Destination pointer */ 00080 q63_t out; /* Temporary output variable */ 00081 int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */ 00082 uint32_t delaySize = S->maxDelay + blockSize; /* state length */ 00083 uint16_t numTaps = S->numTaps; /* Filter order */ 00084 int32_t readIndex; /* Read index of the state buffer */ 00085 uint32_t tapCnt, blkCnt; /* loop counters */ 00086 q31_t coeff = *pCoeffs++; /* Read the first coefficient value */ 00087 q31_t in; 00088 00089 00090 /* BlockSize of Input samples are copied into the state buffer */ 00091 /* StateIndex points to the starting position to write in the state buffer */ 00092 arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1, 00093 (int32_t *) pSrc, 1, blockSize); 00094 00095 /* Read Index, from where the state buffer should be read, is calculated. 
*/ 00096 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; 00097 00098 /* Wraparound of readIndex */ 00099 if(readIndex < 0) 00100 { 00101 readIndex += (int32_t) delaySize; 00102 } 00103 00104 /* Working pointer for state buffer is updated */ 00105 py = pState; 00106 00107 /* blockSize samples are read from the state buffer */ 00108 arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, 00109 (int32_t *) pb, (int32_t *) pb, blockSize, 1, 00110 blockSize); 00111 00112 /* Working pointer for the scratch buffer of state values */ 00113 px = pb; 00114 00115 /* Working pointer for scratch buffer of output values */ 00116 pOut = pDst; 00117 00118 00119 #ifndef ARM_MATH_CM0_FAMILY 00120 00121 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00122 00123 /* Loop over the blockSize. Unroll by a factor of 4. 00124 * Compute 4 Multiplications at a time. */ 00125 blkCnt = blockSize >> 2; 00126 00127 while(blkCnt > 0u) 00128 { 00129 /* Perform Multiplications and store in the destination buffer */ 00130 *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); 00131 *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); 00132 *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); 00133 *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); 00134 00135 /* Decrement the loop counter */ 00136 blkCnt--; 00137 } 00138 00139 /* If the blockSize is not a multiple of 4, 00140 * compute the remaining samples */ 00141 blkCnt = blockSize % 0x4u; 00142 00143 while(blkCnt > 0u) 00144 { 00145 /* Perform Multiplications and store in the destination buffer */ 00146 *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); 00147 00148 /* Decrement the loop counter */ 00149 blkCnt--; 00150 } 00151 00152 /* Load the coefficient value and 00153 * increment the coefficient buffer for the next set of state values */ 00154 coeff = *pCoeffs++; 00155 00156 /* Read Index, from where the state buffer should be read, is calculated. 
*/ 00157 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; 00158 00159 /* Wraparound of readIndex */ 00160 if(readIndex < 0) 00161 { 00162 readIndex += (int32_t) delaySize; 00163 } 00164 00165 /* Loop over the number of taps. */ 00166 tapCnt = (uint32_t) numTaps - 2u; 00167 00168 while(tapCnt > 0u) 00169 { 00170 /* Working pointer for state buffer is updated */ 00171 py = pState; 00172 00173 /* blockSize samples are read from the state buffer */ 00174 arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, 00175 (int32_t *) pb, (int32_t *) pb, blockSize, 1, 00176 blockSize); 00177 00178 /* Working pointer for the scratch buffer of state values */ 00179 px = pb; 00180 00181 /* Working pointer for scratch buffer of output values */ 00182 pOut = pDst; 00183 00184 /* Loop over the blockSize. Unroll by a factor of 4. 00185 * Compute 4 MACS at a time. */ 00186 blkCnt = blockSize >> 2; 00187 00188 while(blkCnt > 0u) 00189 { 00190 out = *pOut; 00191 out += ((q63_t) * px++ * coeff) >> 32; 00192 *pOut++ = (q31_t) (out); 00193 00194 out = *pOut; 00195 out += ((q63_t) * px++ * coeff) >> 32; 00196 *pOut++ = (q31_t) (out); 00197 00198 out = *pOut; 00199 out += ((q63_t) * px++ * coeff) >> 32; 00200 *pOut++ = (q31_t) (out); 00201 00202 out = *pOut; 00203 out += ((q63_t) * px++ * coeff) >> 32; 00204 *pOut++ = (q31_t) (out); 00205 00206 /* Decrement the loop counter */ 00207 blkCnt--; 00208 } 00209 00210 /* If the blockSize is not a multiple of 4, 00211 * compute the remaining samples */ 00212 blkCnt = blockSize % 0x4u; 00213 00214 while(blkCnt > 0u) 00215 { 00216 /* Perform Multiply-Accumulate */ 00217 out = *pOut; 00218 out += ((q63_t) * px++ * coeff) >> 32; 00219 *pOut++ = (q31_t) (out); 00220 00221 /* Decrement the loop counter */ 00222 blkCnt--; 00223 } 00224 00225 /* Load the coefficient value and 00226 * increment the coefficient buffer for the next set of state values */ 00227 coeff = *pCoeffs++; 00228 00229 /* Read Index, from where the state buffer 
should be read, is calculated. */ 00230 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; 00231 00232 /* Wraparound of readIndex */ 00233 if(readIndex < 0) 00234 { 00235 readIndex += (int32_t) delaySize; 00236 } 00237 00238 /* Decrement the tap loop counter */ 00239 tapCnt--; 00240 } 00241 00242 /* Compute last tap without the final read of pTapDelay */ 00243 00244 /* Working pointer for state buffer is updated */ 00245 py = pState; 00246 00247 /* blockSize samples are read from the state buffer */ 00248 arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, 00249 (int32_t *) pb, (int32_t *) pb, blockSize, 1, 00250 blockSize); 00251 00252 /* Working pointer for the scratch buffer of state values */ 00253 px = pb; 00254 00255 /* Working pointer for scratch buffer of output values */ 00256 pOut = pDst; 00257 00258 /* Loop over the blockSize. Unroll by a factor of 4. 00259 * Compute 4 MACS at a time. */ 00260 blkCnt = blockSize >> 2; 00261 00262 while(blkCnt > 0u) 00263 { 00264 out = *pOut; 00265 out += ((q63_t) * px++ * coeff) >> 32; 00266 *pOut++ = (q31_t) (out); 00267 00268 out = *pOut; 00269 out += ((q63_t) * px++ * coeff) >> 32; 00270 *pOut++ = (q31_t) (out); 00271 00272 out = *pOut; 00273 out += ((q63_t) * px++ * coeff) >> 32; 00274 *pOut++ = (q31_t) (out); 00275 00276 out = *pOut; 00277 out += ((q63_t) * px++ * coeff) >> 32; 00278 *pOut++ = (q31_t) (out); 00279 00280 /* Decrement the loop counter */ 00281 blkCnt--; 00282 } 00283 00284 /* If the blockSize is not a multiple of 4, 00285 * compute the remaining samples */ 00286 blkCnt = blockSize % 0x4u; 00287 00288 while(blkCnt > 0u) 00289 { 00290 /* Perform Multiply-Accumulate */ 00291 out = *pOut; 00292 out += ((q63_t) * px++ * coeff) >> 32; 00293 *pOut++ = (q31_t) (out); 00294 00295 /* Decrement the loop counter */ 00296 blkCnt--; 00297 } 00298 00299 /* Working output pointer is updated */ 00300 pOut = pDst; 00301 00302 /* Output is converted into 1.31 format. 
*/ 00303 /* Loop over the blockSize. Unroll by a factor of 4. 00304 * process 4 output samples at a time. */ 00305 blkCnt = blockSize >> 2; 00306 00307 while(blkCnt > 0u) 00308 { 00309 in = *pOut << 1; 00310 *pOut++ = in; 00311 in = *pOut << 1; 00312 *pOut++ = in; 00313 in = *pOut << 1; 00314 *pOut++ = in; 00315 in = *pOut << 1; 00316 *pOut++ = in; 00317 00318 /* Decrement the loop counter */ 00319 blkCnt--; 00320 } 00321 00322 /* If the blockSize is not a multiple of 4, 00323 * process the remaining output samples */ 00324 blkCnt = blockSize % 0x4u; 00325 00326 while(blkCnt > 0u) 00327 { 00328 in = *pOut << 1; 00329 *pOut++ = in; 00330 00331 /* Decrement the loop counter */ 00332 blkCnt--; 00333 } 00334 00335 #else 00336 00337 /* Run the below code for Cortex-M0 */ 00338 blkCnt = blockSize; 00339 00340 while(blkCnt > 0u) 00341 { 00342 /* Perform Multiplications and store in the destination buffer */ 00343 *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); 00344 00345 /* Decrement the loop counter */ 00346 blkCnt--; 00347 } 00348 00349 /* Load the coefficient value and 00350 * increment the coefficient buffer for the next set of state values */ 00351 coeff = *pCoeffs++; 00352 00353 /* Read Index, from where the state buffer should be read, is calculated. */ 00354 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; 00355 00356 /* Wraparound of readIndex */ 00357 if(readIndex < 0) 00358 { 00359 readIndex += (int32_t) delaySize; 00360 } 00361 00362 /* Loop over the number of taps. 
*/ 00363 tapCnt = (uint32_t) numTaps - 2u; 00364 00365 while(tapCnt > 0u) 00366 { 00367 /* Working pointer for state buffer is updated */ 00368 py = pState; 00369 00370 /* blockSize samples are read from the state buffer */ 00371 arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, 00372 (int32_t *) pb, (int32_t *) pb, blockSize, 1, 00373 blockSize); 00374 00375 /* Working pointer for the scratch buffer of state values */ 00376 px = pb; 00377 00378 /* Working pointer for scratch buffer of output values */ 00379 pOut = pDst; 00380 00381 blkCnt = blockSize; 00382 00383 while(blkCnt > 0u) 00384 { 00385 /* Perform Multiply-Accumulate */ 00386 out = *pOut; 00387 out += ((q63_t) * px++ * coeff) >> 32; 00388 *pOut++ = (q31_t) (out); 00389 00390 /* Decrement the loop counter */ 00391 blkCnt--; 00392 } 00393 00394 /* Load the coefficient value and 00395 * increment the coefficient buffer for the next set of state values */ 00396 coeff = *pCoeffs++; 00397 00398 /* Read Index, from where the state buffer should be read, is calculated. 
*/ 00399 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; 00400 00401 /* Wraparound of readIndex */ 00402 if(readIndex < 0) 00403 { 00404 readIndex += (int32_t) delaySize; 00405 } 00406 00407 /* Decrement the tap loop counter */ 00408 tapCnt--; 00409 } 00410 00411 /* Compute last tap without the final read of pTapDelay */ 00412 00413 /* Working pointer for state buffer is updated */ 00414 py = pState; 00415 00416 /* blockSize samples are read from the state buffer */ 00417 arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, 00418 (int32_t *) pb, (int32_t *) pb, blockSize, 1, 00419 blockSize); 00420 00421 /* Working pointer for the scratch buffer of state values */ 00422 px = pb; 00423 00424 /* Working pointer for scratch buffer of output values */ 00425 pOut = pDst; 00426 00427 blkCnt = blockSize; 00428 00429 while(blkCnt > 0u) 00430 { 00431 /* Perform Multiply-Accumulate */ 00432 out = *pOut; 00433 out += ((q63_t) * px++ * coeff) >> 32; 00434 *pOut++ = (q31_t) (out); 00435 00436 /* Decrement the loop counter */ 00437 blkCnt--; 00438 } 00439 00440 /* Working output pointer is updated */ 00441 pOut = pDst; 00442 00443 /* Output is converted into 1.31 format. */ 00444 blkCnt = blockSize; 00445 00446 while(blkCnt > 0u) 00447 { 00448 in = *pOut << 1; 00449 *pOut++ = in; 00450 00451 /* Decrement the loop counter */ 00452 blkCnt--; 00453 } 00454 00455 #endif /* #ifndef ARM_MATH_CM0_FAMILY */ 00456 00457 } 00458 00459 /** 00460 * @} end of FIR_Sparse group 00461 */
Generated on Tue Jul 12 2022 11:59:17 by Doxygen 1.7.2