Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software download the repository Zip archive or clone locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of mbed-os by
arm_conv_q31.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 19. March 2015 00005 * $Revision: V.1.4.5 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_q31.c 00009 * 00010 * Description: Convolution of Q31 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup Conv 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Convolution of Q31 sequences. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. 00059 * @return none. 00060 * 00061 * @details 00062 * <b>Scaling and Overflow Behavior:</b> 00063 * 00064 * \par 00065 * The function is implemented using an internal 64-bit accumulator. 00066 * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. 00067 * There is no saturation on intermediate additions. 00068 * Thus, if the accumulator overflows it wraps around and distorts the result. 00069 * The input signals should be scaled down to avoid intermediate overflows. 00070 * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, 00071 * as maximum of min(srcALen, srcBLen) number of additions are carried internally. 00072 * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result. 00073 * 00074 * \par 00075 * See <code>arm_conv_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4. 00076 */ 00077 00078 void arm_conv_q31( 00079 q31_t * pSrcA, 00080 uint32_t srcALen, 00081 q31_t * pSrcB, 00082 uint32_t srcBLen, 00083 q31_t * pDst) 00084 { 00085 00086 00087 #ifndef ARM_MATH_CM0_FAMILY 00088 00089 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00090 00091 q31_t *pIn1; /* inputA pointer */ 00092 q31_t *pIn2; /* inputB pointer */ 00093 q31_t *pOut = pDst; /* output pointer */ 00094 q31_t *px; /* Intermediate inputA pointer */ 00095 q31_t *py; /* Intermediate inputB pointer */ 00096 q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00097 q63_t sum; /* Accumulator */ 00098 q63_t acc0, acc1, acc2; /* Accumulator */ 00099 q31_t x0, x1, x2, c0; /* Temporary variables to hold state and coefficient values */ 00100 uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */ 00101 00102 /* The algorithm implementation is based on the lengths of the inputs. */ 00103 /* srcB is always made to slide across srcA. */ 00104 /* So srcBLen is always considered as shorter or equal to srcALen */ 00105 if(srcALen >= srcBLen) 00106 { 00107 /* Initialization of inputA pointer */ 00108 pIn1 = pSrcA; 00109 00110 /* Initialization of inputB pointer */ 00111 pIn2 = pSrcB; 00112 } 00113 else 00114 { 00115 /* Initialization of inputA pointer */ 00116 pIn1 = (q31_t *) pSrcB; 00117 00118 /* Initialization of inputB pointer */ 00119 pIn2 = (q31_t *) pSrcA; 00120 00121 /* srcBLen is always considered as shorter or equal to srcALen */ 00122 j = srcBLen; 00123 srcBLen = srcALen; 00124 srcALen = j; 00125 } 00126 00127 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00128 /* The function is internally 00129 * divided into three stages according to the number of multiplications that has to be 00130 * taken place between inputA samples and inputB samples. In the first stage of the 00131 * algorithm, the multiplications increase by one for every iteration. 00132 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00133 * In the third stage of the algorithm, the multiplications decrease by one 00134 * for every iteration. */ 00135 00136 /* The algorithm is implemented in three stages. 00137 The loop counters of each stage is initiated here. */ 00138 blockSize1 = srcBLen - 1u; 00139 blockSize2 = srcALen - (srcBLen - 1u); 00140 blockSize3 = blockSize1; 00141 00142 /* -------------------------- 00143 * Initializations of stage1 00144 * -------------------------*/ 00145 00146 /* sum = x[0] * y[0] 00147 * sum = x[0] * y[1] + x[1] * y[0] 00148 * .... 00149 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00150 */ 00151 00152 /* In this stage the MAC operations are increased by 1 for every iteration. 00153 The count variable holds the number of MAC operations performed */ 00154 count = 1u; 00155 00156 /* Working pointer of inputA */ 00157 px = pIn1; 00158 00159 /* Working pointer of inputB */ 00160 py = pIn2; 00161 00162 00163 /* ------------------------ 00164 * Stage1 process 00165 * ----------------------*/ 00166 00167 /* The first stage starts here */ 00168 while(blockSize1 > 0u) 00169 { 00170 /* Accumulator is made zero for every iteration */ 00171 sum = 0; 00172 00173 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00174 k = count >> 2u; 00175 00176 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00177 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00178 while(k > 0u) 00179 { 00180 /* x[0] * y[srcBLen - 1] */ 00181 sum += (q63_t) * px++ * (*py--); 00182 /* x[1] * y[srcBLen - 2] */ 00183 sum += (q63_t) * px++ * (*py--); 00184 /* x[2] * y[srcBLen - 3] */ 00185 sum += (q63_t) * px++ * (*py--); 00186 /* x[3] * y[srcBLen - 4] */ 00187 sum += (q63_t) * px++ * (*py--); 00188 00189 /* Decrement the loop counter */ 00190 k--; 00191 } 00192 00193 /* If the count is not a multiple of 4, compute any remaining MACs here. 00194 ** No loop unrolling is used. */ 00195 k = count % 0x4u; 00196 00197 while(k > 0u) 00198 { 00199 /* Perform the multiply-accumulate */ 00200 sum += (q63_t) * px++ * (*py--); 00201 00202 /* Decrement the loop counter */ 00203 k--; 00204 } 00205 00206 /* Store the result in the accumulator in the destination buffer. */ 00207 *pOut++ = (q31_t) (sum >> 31); 00208 00209 /* Update the inputA and inputB pointers for next MAC calculation */ 00210 py = pIn2 + count; 00211 px = pIn1; 00212 00213 /* Increment the MAC count */ 00214 count++; 00215 00216 /* Decrement the loop counter */ 00217 blockSize1--; 00218 } 00219 00220 /* -------------------------- 00221 * Initializations of stage2 00222 * ------------------------*/ 00223 00224 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00225 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00226 * .... 00227 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00228 */ 00229 00230 /* Working pointer of inputA */ 00231 px = pIn1; 00232 00233 /* Working pointer of inputB */ 00234 pSrc2 = pIn2 + (srcBLen - 1u); 00235 py = pSrc2; 00236 00237 /* count is index by which the pointer pIn1 to be incremented */ 00238 count = 0u; 00239 00240 /* ------------------- 00241 * Stage2 process 00242 * ------------------*/ 00243 00244 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00245 * So, to loop unroll over blockSize2, 00246 * srcBLen should be greater than or equal to 4 */ 00247 if(srcBLen >= 4u) 00248 { 00249 /* Loop unroll by 3 */ 00250 blkCnt = blockSize2 / 3; 00251 00252 while(blkCnt > 0u) 00253 { 00254 /* Set all accumulators to zero */ 00255 acc0 = 0; 00256 acc1 = 0; 00257 acc2 = 0; 00258 00259 /* read x[0], x[1], x[2] samples */ 00260 x0 = *(px++); 00261 x1 = *(px++); 00262 00263 /* Apply loop unrolling and compute 3 MACs simultaneously. */ 00264 k = srcBLen / 3; 00265 00266 /* First part of the processing with loop unrolling. Compute 3 MACs at a time. 00267 ** a second loop below computes MACs for the remaining 1 to 2 samples. */ 00268 do 00269 { 00270 /* Read y[srcBLen - 1] sample */ 00271 c0 = *(py); 00272 00273 /* Read x[3] sample */ 00274 x2 = *(px); 00275 00276 /* Perform the multiply-accumulates */ 00277 /* acc0 += x[0] * y[srcBLen - 1] */ 00278 acc0 += ((q63_t) x0 * c0); 00279 /* acc1 += x[1] * y[srcBLen - 1] */ 00280 acc1 += ((q63_t) x1 * c0); 00281 /* acc2 += x[2] * y[srcBLen - 1] */ 00282 acc2 += ((q63_t) x2 * c0); 00283 00284 /* Read y[srcBLen - 2] sample */ 00285 c0 = *(py - 1u); 00286 00287 /* Read x[4] sample */ 00288 x0 = *(px + 1u); 00289 00290 /* Perform the multiply-accumulate */ 00291 /* acc0 += x[1] * y[srcBLen - 2] */ 00292 acc0 += ((q63_t) x1 * c0); 00293 /* acc1 += x[2] * y[srcBLen - 2] */ 00294 acc1 += ((q63_t) x2 * c0); 00295 /* acc2 += x[3] * y[srcBLen - 2] */ 00296 acc2 += ((q63_t) x0 * c0); 00297 00298 /* Read y[srcBLen - 3] sample */ 00299 c0 = *(py - 2u); 00300 00301 /* Read x[5] sample */ 00302 x1 = *(px + 2u); 00303 00304 /* Perform the multiply-accumulates */ 00305 /* acc0 += x[2] * y[srcBLen - 3] */ 00306 acc0 += ((q63_t) x2 * c0); 00307 /* acc1 += x[3] * y[srcBLen - 2] */ 00308 acc1 += ((q63_t) x0 * c0); 00309 /* acc2 += x[4] * y[srcBLen - 2] */ 00310 acc2 += ((q63_t) x1 * c0); 00311 00312 /* update scratch pointers */ 00313 px += 3u; 00314 py -= 3u; 00315 00316 } while(--k); 00317 00318 /* If the srcBLen is not a multiple of 3, compute any remaining MACs here. 00319 ** No loop unrolling is used. */ 00320 k = srcBLen - (3 * (srcBLen / 3)); 00321 00322 while(k > 0u) 00323 { 00324 /* Read y[srcBLen - 5] sample */ 00325 c0 = *(py--); 00326 00327 /* Read x[7] sample */ 00328 x2 = *(px++); 00329 00330 /* Perform the multiply-accumulates */ 00331 /* acc0 += x[4] * y[srcBLen - 5] */ 00332 acc0 += ((q63_t) x0 * c0); 00333 /* acc1 += x[5] * y[srcBLen - 5] */ 00334 acc1 += ((q63_t) x1 * c0); 00335 /* acc2 += x[6] * y[srcBLen - 5] */ 00336 acc2 += ((q63_t) x2 * c0); 00337 00338 /* Reuse the present samples for the next MAC */ 00339 x0 = x1; 00340 x1 = x2; 00341 00342 /* Decrement the loop counter */ 00343 k--; 00344 } 00345 00346 /* Store the results in the accumulators in the destination buffer. */ 00347 *pOut++ = (q31_t) (acc0 >> 31); 00348 *pOut++ = (q31_t) (acc1 >> 31); 00349 *pOut++ = (q31_t) (acc2 >> 31); 00350 00351 /* Increment the pointer pIn1 index, count by 3 */ 00352 count += 3u; 00353 00354 /* Update the inputA and inputB pointers for next MAC calculation */ 00355 px = pIn1 + count; 00356 py = pSrc2; 00357 00358 /* Decrement the loop counter */ 00359 blkCnt--; 00360 } 00361 00362 /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here. 00363 ** No loop unrolling is used. */ 00364 blkCnt = blockSize2 - 3 * (blockSize2 / 3); 00365 00366 while(blkCnt > 0u) 00367 { 00368 /* Accumulator is made zero for every iteration */ 00369 sum = 0; 00370 00371 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00372 k = srcBLen >> 2u; 00373 00374 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00375 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00376 while(k > 0u) 00377 { 00378 /* Perform the multiply-accumulates */ 00379 sum += (q63_t) * px++ * (*py--); 00380 sum += (q63_t) * px++ * (*py--); 00381 sum += (q63_t) * px++ * (*py--); 00382 sum += (q63_t) * px++ * (*py--); 00383 00384 /* Decrement the loop counter */ 00385 k--; 00386 } 00387 00388 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00389 ** No loop unrolling is used. */ 00390 k = srcBLen % 0x4u; 00391 00392 while(k > 0u) 00393 { 00394 /* Perform the multiply-accumulate */ 00395 sum += (q63_t) * px++ * (*py--); 00396 00397 /* Decrement the loop counter */ 00398 k--; 00399 } 00400 00401 /* Store the result in the accumulator in the destination buffer. */ 00402 *pOut++ = (q31_t) (sum >> 31); 00403 00404 /* Increment the MAC count */ 00405 count++; 00406 00407 /* Update the inputA and inputB pointers for next MAC calculation */ 00408 px = pIn1 + count; 00409 py = pSrc2; 00410 00411 /* Decrement the loop counter */ 00412 blkCnt--; 00413 } 00414 } 00415 else 00416 { 00417 /* If the srcBLen is not a multiple of 4, 00418 * the blockSize2 loop cannot be unrolled by 4 */ 00419 blkCnt = blockSize2; 00420 00421 while(blkCnt > 0u) 00422 { 00423 /* Accumulator is made zero for every iteration */ 00424 sum = 0; 00425 00426 /* srcBLen number of MACS should be performed */ 00427 k = srcBLen; 00428 00429 while(k > 0u) 00430 { 00431 /* Perform the multiply-accumulate */ 00432 sum += (q63_t) * px++ * (*py--); 00433 00434 /* Decrement the loop counter */ 00435 k--; 00436 } 00437 00438 /* Store the result in the accumulator in the destination buffer. */ 00439 *pOut++ = (q31_t) (sum >> 31); 00440 00441 /* Increment the MAC count */ 00442 count++; 00443 00444 /* Update the inputA and inputB pointers for next MAC calculation */ 00445 px = pIn1 + count; 00446 py = pSrc2; 00447 00448 /* Decrement the loop counter */ 00449 blkCnt--; 00450 } 00451 } 00452 00453 00454 /* -------------------------- 00455 * Initializations of stage3 00456 * -------------------------*/ 00457 00458 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00459 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00460 * .... 00461 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00462 * sum += x[srcALen-1] * y[srcBLen-1] 00463 */ 00464 00465 /* In this stage the MAC operations are decreased by 1 for every iteration. 00466 The blockSize3 variable holds the number of MAC operations performed */ 00467 00468 /* Working pointer of inputA */ 00469 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00470 px = pSrc1; 00471 00472 /* Working pointer of inputB */ 00473 pSrc2 = pIn2 + (srcBLen - 1u); 00474 py = pSrc2; 00475 00476 /* ------------------- 00477 * Stage3 process 00478 * ------------------*/ 00479 00480 while(blockSize3 > 0u) 00481 { 00482 /* Accumulator is made zero for every iteration */ 00483 sum = 0; 00484 00485 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00486 k = blockSize3 >> 2u; 00487 00488 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00489 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00490 while(k > 0u) 00491 { 00492 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00493 sum += (q63_t) * px++ * (*py--); 00494 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00495 sum += (q63_t) * px++ * (*py--); 00496 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00497 sum += (q63_t) * px++ * (*py--); 00498 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00499 sum += (q63_t) * px++ * (*py--); 00500 00501 /* Decrement the loop counter */ 00502 k--; 00503 } 00504 00505 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00506 ** No loop unrolling is used. */ 00507 k = blockSize3 % 0x4u; 00508 00509 while(k > 0u) 00510 { 00511 /* Perform the multiply-accumulate */ 00512 sum += (q63_t) * px++ * (*py--); 00513 00514 /* Decrement the loop counter */ 00515 k--; 00516 } 00517 00518 /* Store the result in the accumulator in the destination buffer. */ 00519 *pOut++ = (q31_t) (sum >> 31); 00520 00521 /* Update the inputA and inputB pointers for next MAC calculation */ 00522 px = ++pSrc1; 00523 py = pSrc2; 00524 00525 /* Decrement the loop counter */ 00526 blockSize3--; 00527 } 00528 00529 #else 00530 00531 /* Run the below code for Cortex-M0 */ 00532 00533 q31_t *pIn1 = pSrcA; /* input pointer */ 00534 q31_t *pIn2 = pSrcB; /* coefficient pointer */ 00535 q63_t sum; /* Accumulator */ 00536 uint32_t i, j; /* loop counter */ 00537 00538 /* Loop to calculate output of convolution for output length number of times */ 00539 for (i = 0; i < (srcALen + srcBLen - 1); i++) 00540 { 00541 /* Initialize sum with zero to carry on MAC operations */ 00542 sum = 0; 00543 00544 /* Loop to perform MAC operations according to convolution equation */ 00545 for (j = 0; j <= i; j++) 00546 { 00547 /* Check the array limitations */ 00548 if(((i - j) < srcBLen) && (j < srcALen)) 00549 { 00550 /* z[i] += x[i-j] * y[j] */ 00551 sum += ((q63_t) pIn1[j] * (pIn2[i - j])); 00552 } 00553 } 00554 00555 /* Store the output in the destination buffer */ 00556 pDst[i] = (q31_t) (sum >> 31u); 00557 } 00558 00559 #endif /* #ifndef ARM_MATH_CM0_FAMILY */ 00560 00561 } 00562 00563 /** 00564 * @} end of Conv group 00565 */
Generated on Tue Jul 12 2022 13:15:23 by
