Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository Zip archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of OmniWheels by
arm_fir_sparse_q31.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_fir_sparse_q31.c 00009 * 00010 * Description: Q31 sparse FIR filter processing function. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * ------------------------------------------------------------------- */ 00040 #include "arm_math.h" 00041 00042 00043 /** 00044 * @addtogroup FIR_Sparse 00045 * @{ 00046 */ 00047 00048 /** 00049 * @brief Processing function for the Q31 sparse FIR filter. 00050 * @param[in] *S points to an instance of the Q31 sparse FIR structure. 00051 * @param[in] *pSrc points to the block of input data. 00052 * @param[out] *pDst points to the block of output data 00053 * @param[in] *pScratchIn points to a temporary buffer of size blockSize. 00054 * @param[in] blockSize number of input samples to process per call. 00055 * @return none. 00056 * 00057 * <b>Scaling and Overflow Behavior:</b> 00058 * \par 00059 * The function is implemented using an internal 32-bit accumulator. 00060 * The 1.31 x 1.31 multiplications are truncated to 2.30 format. 00061 * This leads to loss of precision on the intermediate multiplications and provides only a single guard bit. 00062 * If the accumulator result overflows, it wraps around rather than saturate. 00063 * In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits. 
00064 */ 00065 00066 void arm_fir_sparse_q31( 00067 arm_fir_sparse_instance_q31 * S, 00068 q31_t * pSrc, 00069 q31_t * pDst, 00070 q31_t * pScratchIn, 00071 uint32_t blockSize) 00072 { 00073 00074 q31_t *pState = S->pState; /* State pointer */ 00075 q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 00076 q31_t *px; /* Scratch buffer pointer */ 00077 q31_t *py = pState; /* Temporary pointers for state buffer */ 00078 q31_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */ 00079 q31_t *pOut; /* Destination pointer */ 00080 q63_t out; /* Temporary output variable */ 00081 int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */ 00082 uint32_t delaySize = S->maxDelay + blockSize; /* state length */ 00083 uint16_t numTaps = S->numTaps; /* Filter order */ 00084 int32_t readIndex; /* Read index of the state buffer */ 00085 uint32_t tapCnt, blkCnt; /* loop counters */ 00086 q31_t coeff = *pCoeffs++; /* Read the first coefficient value */ 00087 q31_t in; 00088 00089 00090 /* BlockSize of Input samples are copied into the state buffer */ 00091 /* StateIndex points to the starting position to write in the state buffer */ 00092 arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1, 00093 (int32_t *) pSrc, 1, blockSize); 00094 00095 /* Read Index, from where the state buffer should be read, is calculated. 
*/ 00096 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; 00097 00098 /* Wraparound of readIndex */ 00099 if(readIndex < 0) 00100 { 00101 readIndex += (int32_t) delaySize; 00102 } 00103 00104 /* Working pointer for state buffer is updated */ 00105 py = pState; 00106 00107 /* blockSize samples are read from the state buffer */ 00108 arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, 00109 (int32_t *) pb, (int32_t *) pb, blockSize, 1, 00110 blockSize); 00111 00112 /* Working pointer for the scratch buffer of state values */ 00113 px = pb; 00114 00115 /* Working pointer for scratch buffer of output values */ 00116 pOut = pDst; 00117 00118 00119 #ifndef ARM_MATH_CM0_FAMILY 00120 00121 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00122 00123 /* Loop over the blockSize. Unroll by a factor of 4. 00124 * Compute 4 Multiplications at a time. */ 00125 blkCnt = blockSize >> 2; 00126 00127 while(blkCnt > 0u) 00128 { 00129 /* Perform Multiplications and store in the destination buffer */ 00130 *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); 00131 *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); 00132 *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); 00133 *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); 00134 00135 /* Decrement the loop counter */ 00136 blkCnt--; 00137 } 00138 00139 /* If the blockSize is not a multiple of 4, 00140 * compute the remaining samples */ 00141 blkCnt = blockSize % 0x4u; 00142 00143 while(blkCnt > 0u) 00144 { 00145 /* Perform Multiplications and store in the destination buffer */ 00146 *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); 00147 00148 /* Decrement the loop counter */ 00149 blkCnt--; 00150 } 00151 00152 /* Load the coefficient value and 00153 * increment the coefficient buffer for the next set of state values */ 00154 coeff = *pCoeffs++; 00155 00156 /* Read Index, from where the state buffer should be read, is calculated. 
*/ 00157 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; 00158 00159 /* Wraparound of readIndex */ 00160 if(readIndex < 0) 00161 { 00162 readIndex += (int32_t) delaySize; 00163 } 00164 00165 /* Loop over the number of taps. */ 00166 tapCnt = (uint32_t) numTaps - 2u; 00167 00168 while(tapCnt > 0u) 00169 { 00170 /* Working pointer for state buffer is updated */ 00171 py = pState; 00172 00173 /* blockSize samples are read from the state buffer */ 00174 arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, 00175 (int32_t *) pb, (int32_t *) pb, blockSize, 1, 00176 blockSize); 00177 00178 /* Working pointer for the scratch buffer of state values */ 00179 px = pb; 00180 00181 /* Working pointer for scratch buffer of output values */ 00182 pOut = pDst; 00183 00184 /* Loop over the blockSize. Unroll by a factor of 4. 00185 * Compute 4 MACS at a time. */ 00186 blkCnt = blockSize >> 2; 00187 00188 while(blkCnt > 0u) 00189 { 00190 out = *pOut; 00191 out += ((q63_t) * px++ * coeff) >> 32; 00192 *pOut++ = (q31_t) (out); 00193 00194 out = *pOut; 00195 out += ((q63_t) * px++ * coeff) >> 32; 00196 *pOut++ = (q31_t) (out); 00197 00198 out = *pOut; 00199 out += ((q63_t) * px++ * coeff) >> 32; 00200 *pOut++ = (q31_t) (out); 00201 00202 out = *pOut; 00203 out += ((q63_t) * px++ * coeff) >> 32; 00204 *pOut++ = (q31_t) (out); 00205 00206 /* Decrement the loop counter */ 00207 blkCnt--; 00208 } 00209 00210 /* If the blockSize is not a multiple of 4, 00211 * compute the remaining samples */ 00212 blkCnt = blockSize % 0x4u; 00213 00214 while(blkCnt > 0u) 00215 { 00216 /* Perform Multiply-Accumulate */ 00217 out = *pOut; 00218 out += ((q63_t) * px++ * coeff) >> 32; 00219 *pOut++ = (q31_t) (out); 00220 00221 /* Decrement the loop counter */ 00222 blkCnt--; 00223 } 00224 00225 /* Load the coefficient value and 00226 * increment the coefficient buffer for the next set of state values */ 00227 coeff = *pCoeffs++; 00228 00229 /* Read Index, from where the state buffer 
should be read, is calculated. */ 00230 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; 00231 00232 /* Wraparound of readIndex */ 00233 if(readIndex < 0) 00234 { 00235 readIndex += (int32_t) delaySize; 00236 } 00237 00238 /* Decrement the tap loop counter */ 00239 tapCnt--; 00240 } 00241 00242 /* Compute last tap without the final read of pTapDelay */ 00243 00244 /* Working pointer for state buffer is updated */ 00245 py = pState; 00246 00247 /* blockSize samples are read from the state buffer */ 00248 arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, 00249 (int32_t *) pb, (int32_t *) pb, blockSize, 1, 00250 blockSize); 00251 00252 /* Working pointer for the scratch buffer of state values */ 00253 px = pb; 00254 00255 /* Working pointer for scratch buffer of output values */ 00256 pOut = pDst; 00257 00258 /* Loop over the blockSize. Unroll by a factor of 4. 00259 * Compute 4 MACS at a time. */ 00260 blkCnt = blockSize >> 2; 00261 00262 while(blkCnt > 0u) 00263 { 00264 out = *pOut; 00265 out += ((q63_t) * px++ * coeff) >> 32; 00266 *pOut++ = (q31_t) (out); 00267 00268 out = *pOut; 00269 out += ((q63_t) * px++ * coeff) >> 32; 00270 *pOut++ = (q31_t) (out); 00271 00272 out = *pOut; 00273 out += ((q63_t) * px++ * coeff) >> 32; 00274 *pOut++ = (q31_t) (out); 00275 00276 out = *pOut; 00277 out += ((q63_t) * px++ * coeff) >> 32; 00278 *pOut++ = (q31_t) (out); 00279 00280 /* Decrement the loop counter */ 00281 blkCnt--; 00282 } 00283 00284 /* If the blockSize is not a multiple of 4, 00285 * compute the remaining samples */ 00286 blkCnt = blockSize % 0x4u; 00287 00288 while(blkCnt > 0u) 00289 { 00290 /* Perform Multiply-Accumulate */ 00291 out = *pOut; 00292 out += ((q63_t) * px++ * coeff) >> 32; 00293 *pOut++ = (q31_t) (out); 00294 00295 /* Decrement the loop counter */ 00296 blkCnt--; 00297 } 00298 00299 /* Working output pointer is updated */ 00300 pOut = pDst; 00301 00302 /* Output is converted into 1.31 format. 
*/ 00303 /* Loop over the blockSize. Unroll by a factor of 4. 00304 * process 4 output samples at a time. */ 00305 blkCnt = blockSize >> 2; 00306 00307 while(blkCnt > 0u) 00308 { 00309 in = *pOut << 1; 00310 *pOut++ = in; 00311 in = *pOut << 1; 00312 *pOut++ = in; 00313 in = *pOut << 1; 00314 *pOut++ = in; 00315 in = *pOut << 1; 00316 *pOut++ = in; 00317 00318 /* Decrement the loop counter */ 00319 blkCnt--; 00320 } 00321 00322 /* If the blockSize is not a multiple of 4, 00323 * process the remaining output samples */ 00324 blkCnt = blockSize % 0x4u; 00325 00326 while(blkCnt > 0u) 00327 { 00328 in = *pOut << 1; 00329 *pOut++ = in; 00330 00331 /* Decrement the loop counter */ 00332 blkCnt--; 00333 } 00334 00335 #else 00336 00337 /* Run the below code for Cortex-M0 */ 00338 blkCnt = blockSize; 00339 00340 while(blkCnt > 0u) 00341 { 00342 /* Perform Multiplications and store in the destination buffer */ 00343 *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32); 00344 00345 /* Decrement the loop counter */ 00346 blkCnt--; 00347 } 00348 00349 /* Load the coefficient value and 00350 * increment the coefficient buffer for the next set of state values */ 00351 coeff = *pCoeffs++; 00352 00353 /* Read Index, from where the state buffer should be read, is calculated. */ 00354 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; 00355 00356 /* Wraparound of readIndex */ 00357 if(readIndex < 0) 00358 { 00359 readIndex += (int32_t) delaySize; 00360 } 00361 00362 /* Loop over the number of taps. 
*/ 00363 tapCnt = (uint32_t) numTaps - 2u; 00364 00365 while(tapCnt > 0u) 00366 { 00367 /* Working pointer for state buffer is updated */ 00368 py = pState; 00369 00370 /* blockSize samples are read from the state buffer */ 00371 arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, 00372 (int32_t *) pb, (int32_t *) pb, blockSize, 1, 00373 blockSize); 00374 00375 /* Working pointer for the scratch buffer of state values */ 00376 px = pb; 00377 00378 /* Working pointer for scratch buffer of output values */ 00379 pOut = pDst; 00380 00381 blkCnt = blockSize; 00382 00383 while(blkCnt > 0u) 00384 { 00385 /* Perform Multiply-Accumulate */ 00386 out = *pOut; 00387 out += ((q63_t) * px++ * coeff) >> 32; 00388 *pOut++ = (q31_t) (out); 00389 00390 /* Decrement the loop counter */ 00391 blkCnt--; 00392 } 00393 00394 /* Load the coefficient value and 00395 * increment the coefficient buffer for the next set of state values */ 00396 coeff = *pCoeffs++; 00397 00398 /* Read Index, from where the state buffer should be read, is calculated. 
*/ 00399 readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; 00400 00401 /* Wraparound of readIndex */ 00402 if(readIndex < 0) 00403 { 00404 readIndex += (int32_t) delaySize; 00405 } 00406 00407 /* Decrement the tap loop counter */ 00408 tapCnt--; 00409 } 00410 00411 /* Compute last tap without the final read of pTapDelay */ 00412 00413 /* Working pointer for state buffer is updated */ 00414 py = pState; 00415 00416 /* blockSize samples are read from the state buffer */ 00417 arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1, 00418 (int32_t *) pb, (int32_t *) pb, blockSize, 1, 00419 blockSize); 00420 00421 /* Working pointer for the scratch buffer of state values */ 00422 px = pb; 00423 00424 /* Working pointer for scratch buffer of output values */ 00425 pOut = pDst; 00426 00427 blkCnt = blockSize; 00428 00429 while(blkCnt > 0u) 00430 { 00431 /* Perform Multiply-Accumulate */ 00432 out = *pOut; 00433 out += ((q63_t) * px++ * coeff) >> 32; 00434 *pOut++ = (q31_t) (out); 00435 00436 /* Decrement the loop counter */ 00437 blkCnt--; 00438 } 00439 00440 /* Working output pointer is updated */ 00441 pOut = pDst; 00442 00443 /* Output is converted into 1.31 format. */ 00444 blkCnt = blockSize; 00445 00446 while(blkCnt > 0u) 00447 { 00448 in = *pOut << 1; 00449 *pOut++ = in; 00450 00451 /* Decrement the loop counter */ 00452 blkCnt--; 00453 } 00454 00455 #endif /* #ifndef ARM_MATH_CM0_FAMILY */ 00456 00457 } 00458 00459 /** 00460 * @} end of FIR_Sparse group 00461 */
Generated on Fri Jul 22 2022 04:53:44 by Doxygen 1.7.2
