Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software download the repository Zip archive or clone locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of mbed-dsp by
arm_correlate_q31.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_correlate_q31.c 00009 * 00010 * Description: Correlation of Q31 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @addtogroup Corr 00049 * @{ 00050 */ 00051 00052 /** 00053 * @brief Correlation of Q31 sequences. 00054 * @param[in] *pSrcA points to the first input sequence. 00055 * @param[in] srcALen length of the first input sequence. 00056 * @param[in] *pSrcB points to the second input sequence. 00057 * @param[in] srcBLen length of the second input sequence. 00058 * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. 00059 * @return none. 00060 * 00061 * @details 00062 * <b>Scaling and Overflow Behavior:</b> 00063 * 00064 * \par 00065 * The function is implemented using an internal 64-bit accumulator. 00066 * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. 00067 * There is no saturation on intermediate additions. 00068 * Thus, if the accumulator overflows it wraps around and distorts the result. 00069 * The input signals should be scaled down to avoid intermediate overflows. 00070 * Scale down one of the inputs by 1/min(srcALen, srcBLen)to avoid overflows since a 00071 * maximum of min(srcALen, srcBLen) number of additions is carried internally. 00072 * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result. 00073 * 00074 * \par 00075 * See <code>arm_correlate_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4. 00076 */ 00077 00078 void arm_correlate_q31( 00079 q31_t * pSrcA, 00080 uint32_t srcALen, 00081 q31_t * pSrcB, 00082 uint32_t srcBLen, 00083 q31_t * pDst) 00084 { 00085 00086 #ifndef ARM_MATH_CM0_FAMILY 00087 00088 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00089 00090 q31_t *pIn1; /* inputA pointer */ 00091 q31_t *pIn2; /* inputB pointer */ 00092 q31_t *pOut = pDst; /* output pointer */ 00093 q31_t *px; /* Intermediate inputA pointer */ 00094 q31_t *py; /* Intermediate inputB pointer */ 00095 q31_t *pSrc1; /* Intermediate pointers */ 00096 q63_t sum, acc0, acc1, acc2; /* Accumulators */ 00097 q31_t x0, x1, x2, c0; /* temporary variables for holding input and coefficient values */ 00098 uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ 00099 int32_t inc = 1; /* Destination address modifier */ 00100 00101 00102 /* The algorithm implementation is based on the lengths of the inputs. */ 00103 /* srcB is always made to slide across srcA. */ 00104 /* So srcBLen is always considered as shorter or equal to srcALen */ 00105 /* But CORR(x, y) is reverse of CORR(y, x) */ 00106 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00107 /* and the destination pointer modifier, inc is set to -1 */ 00108 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ 00109 /* But to improve the performance, 00110 * we include zeroes in the output instead of zero padding either of the the inputs*/ 00111 /* If srcALen > srcBLen, 00112 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ 00113 /* If srcALen < srcBLen, 00114 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ 00115 if(srcALen >= srcBLen) 00116 { 00117 /* Initialization of inputA pointer */ 00118 pIn1 = (pSrcA); 00119 00120 /* Initialization of inputB pointer */ 00121 pIn2 = (pSrcB); 00122 00123 /* Number of output samples is calculated */ 00124 outBlockSize = (2u * srcALen) - 1u; 00125 00126 /* When srcALen > srcBLen, zero padding is done to srcB 00127 * to make their lengths equal. 00128 * Instead, (outBlockSize - (srcALen + srcBLen - 1)) 00129 * number of output samples are made zero */ 00130 j = outBlockSize - (srcALen + (srcBLen - 1u)); 00131 00132 /* Updating the pointer position to non zero value */ 00133 pOut += j; 00134 00135 } 00136 else 00137 { 00138 /* Initialization of inputA pointer */ 00139 pIn1 = (pSrcB); 00140 00141 /* Initialization of inputB pointer */ 00142 pIn2 = (pSrcA); 00143 00144 /* srcBLen is always considered as shorter or equal to srcALen */ 00145 j = srcBLen; 00146 srcBLen = srcALen; 00147 srcALen = j; 00148 00149 /* CORR(x, y) = Reverse order(CORR(y, x)) */ 00150 /* Hence set the destination pointer to point to the last output sample */ 00151 pOut = pDst + ((srcALen + srcBLen) - 2u); 00152 00153 /* Destination address modifier is set to -1 */ 00154 inc = -1; 00155 00156 } 00157 00158 /* The function is internally 00159 * divided into three parts according to the number of multiplications that has to be 00160 * taken place between inputA samples and inputB samples. In the first part of the 00161 * algorithm, the multiplications increase by one for every iteration. 00162 * In the second part of the algorithm, srcBLen number of multiplications are done. 00163 * In the third part of the algorithm, the multiplications decrease by one 00164 * for every iteration.*/ 00165 /* The algorithm is implemented in three stages. 00166 * The loop counters of each stage is initiated here. */ 00167 blockSize1 = srcBLen - 1u; 00168 blockSize2 = srcALen - (srcBLen - 1u); 00169 blockSize3 = blockSize1; 00170 00171 /* -------------------------- 00172 * Initializations of stage1 00173 * -------------------------*/ 00174 00175 /* sum = x[0] * y[srcBlen - 1] 00176 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] 00177 * .... 00178 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] 00179 */ 00180 00181 /* In this stage the MAC operations are increased by 1 for every iteration. 00182 The count variable holds the number of MAC operations performed */ 00183 count = 1u; 00184 00185 /* Working pointer of inputA */ 00186 px = pIn1; 00187 00188 /* Working pointer of inputB */ 00189 pSrc1 = pIn2 + (srcBLen - 1u); 00190 py = pSrc1; 00191 00192 /* ------------------------ 00193 * Stage1 process 00194 * ----------------------*/ 00195 00196 /* The first stage starts here */ 00197 while(blockSize1 > 0u) 00198 { 00199 /* Accumulator is made zero for every iteration */ 00200 sum = 0; 00201 00202 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00203 k = count >> 2; 00204 00205 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00206 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00207 while(k > 0u) 00208 { 00209 /* x[0] * y[srcBLen - 4] */ 00210 sum += (q63_t) * px++ * (*py++); 00211 /* x[1] * y[srcBLen - 3] */ 00212 sum += (q63_t) * px++ * (*py++); 00213 /* x[2] * y[srcBLen - 2] */ 00214 sum += (q63_t) * px++ * (*py++); 00215 /* x[3] * y[srcBLen - 1] */ 00216 sum += (q63_t) * px++ * (*py++); 00217 00218 /* Decrement the loop counter */ 00219 k--; 00220 } 00221 00222 /* If the count is not a multiple of 4, compute any remaining MACs here. 00223 ** No loop unrolling is used. */ 00224 k = count % 0x4u; 00225 00226 while(k > 0u) 00227 { 00228 /* Perform the multiply-accumulates */ 00229 /* x[0] * y[srcBLen - 1] */ 00230 sum += (q63_t) * px++ * (*py++); 00231 00232 /* Decrement the loop counter */ 00233 k--; 00234 } 00235 00236 /* Store the result in the accumulator in the destination buffer. */ 00237 *pOut = (q31_t) (sum >> 31); 00238 /* Destination pointer is updated according to the address modifier, inc */ 00239 pOut += inc; 00240 00241 /* Update the inputA and inputB pointers for next MAC calculation */ 00242 py = pSrc1 - count; 00243 px = pIn1; 00244 00245 /* Increment the MAC count */ 00246 count++; 00247 00248 /* Decrement the loop counter */ 00249 blockSize1--; 00250 } 00251 00252 /* -------------------------- 00253 * Initializations of stage2 00254 * ------------------------*/ 00255 00256 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] 00257 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] 00258 * .... 00259 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00260 */ 00261 00262 /* Working pointer of inputA */ 00263 px = pIn1; 00264 00265 /* Working pointer of inputB */ 00266 py = pIn2; 00267 00268 /* count is index by which the pointer pIn1 to be incremented */ 00269 count = 0u; 00270 00271 /* ------------------- 00272 * Stage2 process 00273 * ------------------*/ 00274 00275 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00276 * So, to loop unroll over blockSize2, 00277 * srcBLen should be greater than or equal to 4 */ 00278 if(srcBLen >= 4u) 00279 { 00280 /* Loop unroll by 3 */ 00281 blkCnt = blockSize2 / 3; 00282 00283 while(blkCnt > 0u) 00284 { 00285 /* Set all accumulators to zero */ 00286 acc0 = 0; 00287 acc1 = 0; 00288 acc2 = 0; 00289 00290 /* read x[0], x[1] samples */ 00291 x0 = *(px++); 00292 x1 = *(px++); 00293 00294 /* Apply loop unrolling and compute 3 MACs simultaneously. */ 00295 k = srcBLen / 3; 00296 00297 /* First part of the processing with loop unrolling. Compute 3 MACs at a time. 00298 ** a second loop below computes MACs for the remaining 1 to 2 samples. */ 00299 do 00300 { 00301 /* Read y[0] sample */ 00302 c0 = *(py); 00303 00304 /* Read x[2] sample */ 00305 x2 = *(px); 00306 00307 /* Perform the multiply-accumulate */ 00308 /* acc0 += x[0] * y[0] */ 00309 acc0 += ((q63_t) x0 * c0); 00310 /* acc1 += x[1] * y[0] */ 00311 acc1 += ((q63_t) x1 * c0); 00312 /* acc2 += x[2] * y[0] */ 00313 acc2 += ((q63_t) x2 * c0); 00314 00315 /* Read y[1] sample */ 00316 c0 = *(py + 1u); 00317 00318 /* Read x[3] sample */ 00319 x0 = *(px + 1u); 00320 00321 /* Perform the multiply-accumulates */ 00322 /* acc0 += x[1] * y[1] */ 00323 acc0 += ((q63_t) x1 * c0); 00324 /* acc1 += x[2] * y[1] */ 00325 acc1 += ((q63_t) x2 * c0); 00326 /* acc2 += x[3] * y[1] */ 00327 acc2 += ((q63_t) x0 * c0); 00328 00329 /* Read y[2] sample */ 00330 c0 = *(py + 2u); 00331 00332 /* Read x[4] sample */ 00333 x1 = *(px + 2u); 00334 00335 /* Perform the multiply-accumulates */ 00336 /* acc0 += x[2] * y[2] */ 00337 acc0 += ((q63_t) x2 * c0); 00338 /* acc1 += x[3] * y[2] */ 00339 acc1 += ((q63_t) x0 * c0); 00340 /* acc2 += x[4] * y[2] */ 00341 acc2 += ((q63_t) x1 * c0); 00342 00343 /* update scratch pointers */ 00344 px += 3u; 00345 py += 3u; 00346 00347 } while(--k); 00348 00349 /* If the srcBLen is not a multiple of 3, compute any remaining MACs here. 00350 ** No loop unrolling is used. */ 00351 k = srcBLen - (3 * (srcBLen / 3)); 00352 00353 while(k > 0u) 00354 { 00355 /* Read y[4] sample */ 00356 c0 = *(py++); 00357 00358 /* Read x[7] sample */ 00359 x2 = *(px++); 00360 00361 /* Perform the multiply-accumulates */ 00362 /* acc0 += x[4] * y[4] */ 00363 acc0 += ((q63_t) x0 * c0); 00364 /* acc1 += x[5] * y[4] */ 00365 acc1 += ((q63_t) x1 * c0); 00366 /* acc2 += x[6] * y[4] */ 00367 acc2 += ((q63_t) x2 * c0); 00368 00369 /* Reuse the present samples for the next MAC */ 00370 x0 = x1; 00371 x1 = x2; 00372 00373 /* Decrement the loop counter */ 00374 k--; 00375 } 00376 00377 /* Store the result in the accumulator in the destination buffer. */ 00378 *pOut = (q31_t) (acc0 >> 31); 00379 /* Destination pointer is updated according to the address modifier, inc */ 00380 pOut += inc; 00381 00382 *pOut = (q31_t) (acc1 >> 31); 00383 pOut += inc; 00384 00385 *pOut = (q31_t) (acc2 >> 31); 00386 pOut += inc; 00387 00388 /* Increment the pointer pIn1 index, count by 3 */ 00389 count += 3u; 00390 00391 /* Update the inputA and inputB pointers for next MAC calculation */ 00392 px = pIn1 + count; 00393 py = pIn2; 00394 00395 00396 /* Decrement the loop counter */ 00397 blkCnt--; 00398 } 00399 00400 /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here. 00401 ** No loop unrolling is used. */ 00402 blkCnt = blockSize2 - 3 * (blockSize2 / 3); 00403 00404 while(blkCnt > 0u) 00405 { 00406 /* Accumulator is made zero for every iteration */ 00407 sum = 0; 00408 00409 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00410 k = srcBLen >> 2u; 00411 00412 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00413 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00414 while(k > 0u) 00415 { 00416 /* Perform the multiply-accumulates */ 00417 sum += (q63_t) * px++ * (*py++); 00418 sum += (q63_t) * px++ * (*py++); 00419 sum += (q63_t) * px++ * (*py++); 00420 sum += (q63_t) * px++ * (*py++); 00421 00422 /* Decrement the loop counter */ 00423 k--; 00424 } 00425 00426 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00427 ** No loop unrolling is used. */ 00428 k = srcBLen % 0x4u; 00429 00430 while(k > 0u) 00431 { 00432 /* Perform the multiply-accumulate */ 00433 sum += (q63_t) * px++ * (*py++); 00434 00435 /* Decrement the loop counter */ 00436 k--; 00437 } 00438 00439 /* Store the result in the accumulator in the destination buffer. */ 00440 *pOut = (q31_t) (sum >> 31); 00441 /* Destination pointer is updated according to the address modifier, inc */ 00442 pOut += inc; 00443 00444 /* Increment the MAC count */ 00445 count++; 00446 00447 /* Update the inputA and inputB pointers for next MAC calculation */ 00448 px = pIn1 + count; 00449 py = pIn2; 00450 00451 /* Decrement the loop counter */ 00452 blkCnt--; 00453 } 00454 } 00455 else 00456 { 00457 /* If the srcBLen is not a multiple of 4, 00458 * the blockSize2 loop cannot be unrolled by 4 */ 00459 blkCnt = blockSize2; 00460 00461 while(blkCnt > 0u) 00462 { 00463 /* Accumulator is made zero for every iteration */ 00464 sum = 0; 00465 00466 /* Loop over srcBLen */ 00467 k = srcBLen; 00468 00469 while(k > 0u) 00470 { 00471 /* Perform the multiply-accumulate */ 00472 sum += (q63_t) * px++ * (*py++); 00473 00474 /* Decrement the loop counter */ 00475 k--; 00476 } 00477 00478 /* Store the result in the accumulator in the destination buffer. */ 00479 *pOut = (q31_t) (sum >> 31); 00480 /* Destination pointer is updated according to the address modifier, inc */ 00481 pOut += inc; 00482 00483 /* Increment the MAC count */ 00484 count++; 00485 00486 /* Update the inputA and inputB pointers for next MAC calculation */ 00487 px = pIn1 + count; 00488 py = pIn2; 00489 00490 /* Decrement the loop counter */ 00491 blkCnt--; 00492 } 00493 } 00494 00495 /* -------------------------- 00496 * Initializations of stage3 00497 * -------------------------*/ 00498 00499 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00500 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00501 * .... 00502 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] 00503 * sum += x[srcALen-1] * y[0] 00504 */ 00505 00506 /* In this stage the MAC operations are decreased by 1 for every iteration. 00507 The count variable holds the number of MAC operations performed */ 00508 count = srcBLen - 1u; 00509 00510 /* Working pointer of inputA */ 00511 pSrc1 = pIn1 + (srcALen - (srcBLen - 1u)); 00512 px = pSrc1; 00513 00514 /* Working pointer of inputB */ 00515 py = pIn2; 00516 00517 /* ------------------- 00518 * Stage3 process 00519 * ------------------*/ 00520 00521 while(blockSize3 > 0u) 00522 { 00523 /* Accumulator is made zero for every iteration */ 00524 sum = 0; 00525 00526 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00527 k = count >> 2u; 00528 00529 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00530 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00531 while(k > 0u) 00532 { 00533 /* Perform the multiply-accumulates */ 00534 /* sum += x[srcALen - srcBLen + 4] * y[3] */ 00535 sum += (q63_t) * px++ * (*py++); 00536 /* sum += x[srcALen - srcBLen + 3] * y[2] */ 00537 sum += (q63_t) * px++ * (*py++); 00538 /* sum += x[srcALen - srcBLen + 2] * y[1] */ 00539 sum += (q63_t) * px++ * (*py++); 00540 /* sum += x[srcALen - srcBLen + 1] * y[0] */ 00541 sum += (q63_t) * px++ * (*py++); 00542 00543 /* Decrement the loop counter */ 00544 k--; 00545 } 00546 00547 /* If the count is not a multiple of 4, compute any remaining MACs here. 00548 ** No loop unrolling is used. */ 00549 k = count % 0x4u; 00550 00551 while(k > 0u) 00552 { 00553 /* Perform the multiply-accumulates */ 00554 sum += (q63_t) * px++ * (*py++); 00555 00556 /* Decrement the loop counter */ 00557 k--; 00558 } 00559 00560 /* Store the result in the accumulator in the destination buffer. */ 00561 *pOut = (q31_t) (sum >> 31); 00562 /* Destination pointer is updated according to the address modifier, inc */ 00563 pOut += inc; 00564 00565 /* Update the inputA and inputB pointers for next MAC calculation */ 00566 px = ++pSrc1; 00567 py = pIn2; 00568 00569 /* Decrement the MAC count */ 00570 count--; 00571 00572 /* Decrement the loop counter */ 00573 blockSize3--; 00574 } 00575 00576 #else 00577 00578 /* Run the below code for Cortex-M0 */ 00579 00580 q31_t *pIn1 = pSrcA; /* inputA pointer */ 00581 q31_t *pIn2 = pSrcB + (srcBLen - 1u); /* inputB pointer */ 00582 q63_t sum; /* Accumulators */ 00583 uint32_t i = 0u, j; /* loop counters */ 00584 uint32_t inv = 0u; /* Reverse order flag */ 00585 uint32_t tot = 0u; /* Length */ 00586 00587 /* The algorithm implementation is based on the lengths of the inputs. */ 00588 /* srcB is always made to slide across srcA. */ 00589 /* So srcBLen is always considered as shorter or equal to srcALen */ 00590 /* But CORR(x, y) is reverse of CORR(y, x) */ 00591 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00592 /* and a varaible, inv is set to 1 */ 00593 /* If lengths are not equal then zero pad has to be done to make the two 00594 * inputs of same length. But to improve the performance, we include zeroes 00595 * in the output instead of zero padding either of the the inputs*/ 00596 /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the 00597 * starting of the output buffer */ 00598 /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the 00599 * ending of the output buffer */ 00600 /* Once the zero padding is done the remaining of the output is calcualted 00601 * using correlation but with the shorter signal time shifted. */ 00602 00603 /* Calculate the length of the remaining sequence */ 00604 tot = ((srcALen + srcBLen) - 2u); 00605 00606 if(srcALen > srcBLen) 00607 { 00608 /* Calculating the number of zeros to be padded to the output */ 00609 j = srcALen - srcBLen; 00610 00611 /* Initialise the pointer after zero padding */ 00612 pDst += j; 00613 } 00614 00615 else if(srcALen < srcBLen) 00616 { 00617 /* Initialization to inputB pointer */ 00618 pIn1 = pSrcB; 00619 00620 /* Initialization to the end of inputA pointer */ 00621 pIn2 = pSrcA + (srcALen - 1u); 00622 00623 /* Initialisation of the pointer after zero padding */ 00624 pDst = pDst + tot; 00625 00626 /* Swapping the lengths */ 00627 j = srcALen; 00628 srcALen = srcBLen; 00629 srcBLen = j; 00630 00631 /* Setting the reverse flag */ 00632 inv = 1; 00633 00634 } 00635 00636 /* Loop to calculate correlation for output length number of times */ 00637 for (i = 0u; i <= tot; i++) 00638 { 00639 /* Initialize sum with zero to carry on MAC operations */ 00640 sum = 0; 00641 00642 /* Loop to perform MAC operations according to correlation equation */ 00643 for (j = 0u; j <= i; j++) 00644 { 00645 /* Check the array limitations */ 00646 if((((i - j) < srcBLen) && (j < srcALen))) 00647 { 00648 /* z[i] += x[i-j] * y[j] */ 00649 sum += ((q63_t) pIn1[j] * pIn2[-((int32_t) i - j)]); 00650 } 00651 } 00652 /* Store the output in the destination buffer */ 00653 if(inv == 1) 00654 *pDst-- = (q31_t) (sum >> 31u); 00655 else 00656 *pDst++ = (q31_t) (sum >> 31u); 00657 } 00658 00659 #endif /* #ifndef ARM_MATH_CM0_FAMILY */ 00660 00661 } 00662 00663 /** 00664 * @} end of Corr group 00665 */
Generated on Tue Jul 12 2022 18:44:08 by
1.7.2
