CMSIS DSP library
Dependents: KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more
Fork of mbed-dsp by
arm_correlate_f32.c
00001 /* ---------------------------------------------------------------------------- 00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 17. January 2013 00005 * $Revision: V1.4.1 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_correlate_f32.c 00009 * 00010 * Description: Correlation of floating-point sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Redistribution and use in source and binary forms, with or without 00015 * modification, are permitted provided that the following conditions 00016 * are met: 00017 * - Redistributions of source code must retain the above copyright 00018 * notice, this list of conditions and the following disclaimer. 00019 * - Redistributions in binary form must reproduce the above copyright 00020 * notice, this list of conditions and the following disclaimer in 00021 * the documentation and/or other materials provided with the 00022 * distribution. 00023 * - Neither the name of ARM LIMITED nor the names of its contributors 00024 * may be used to endorse or promote products derived from this 00025 * software without specific prior written permission. 00026 * 00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00038 * POSSIBILITY OF SUCH DAMAGE. 00039 * -------------------------------------------------------------------------- */ 00040 00041 #include "arm_math.h" 00042 00043 /** 00044 * @ingroup groupFilters 00045 */ 00046 00047 /** 00048 * @defgroup Corr Correlation 00049 * 00050 * Correlation is a mathematical operation that is similar to convolution. 00051 * As with convolution, correlation uses two signals to produce a third signal. 00052 * The underlying algorithms in correlation and convolution are identical except that one of the inputs is flipped in convolution. 00053 * Correlation is commonly used to measure the similarity between two signals. 00054 * It has applications in pattern recognition, cryptanalysis, and searching. 00055 * The CMSIS library provides correlation functions for Q7, Q15, Q31 and floating-point data types. 00056 * Fast versions of the Q15 and Q31 functions are also provided. 00057 * 00058 * \par Algorithm 00059 * Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively. 00060 * The convolution of the two signals is denoted by 00061 * <pre> 00062 * c[n] = a[n] * b[n] 00063 * </pre> 00064 * In correlation, one of the signals is flipped in time 00065 * <pre> 00066 * c[n] = a[n] * b[-n] 00067 * </pre> 00068 * 00069 * \par 00070 * and this is mathematically defined as 00071 * \image html CorrelateEquation.gif 00072 * \par 00073 * The <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>. 00074 * The result <code>c[n]</code> is of length <code>2 * max(srcALen, srcBLen) - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., (2 * max(srcALen, srcBLen) - 2)</code>. 00075 * The output result is written to <code>pDst</code> and the calling function must allocate <code>2 * max(srcALen, srcBLen) - 1</code> words for the result. 00076 * 00077 * <b>Note</b> 00078 * \par 00079 * The <code>pDst</code> should be initialized to all zeros before being used. 00080 * 00081 * <b>Fixed-Point Behavior</b> 00082 * \par 00083 * Correlation requires summing up a large number of intermediate products. 00084 * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation. 00085 * Refer to the function specific documentation below for further details of the particular algorithm used. 00086 * 00087 * 00088 * <b>Fast Versions</b> 00089 * 00090 * \par 00091 * Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of correlate and the design requires 00092 * the input signals should be scaled down to avoid intermediate overflows. 00093 * 00094 * 00095 * <b>Opt Versions</b> 00096 * 00097 * \par 00098 * Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation. 00099 * These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions of correlate 00100 */ 00101 00102 /** 00103 * @addtogroup Corr 00104 * @{ 00105 */ 00106 /** 00107 * @brief Correlation of floating-point sequences. 00108 * @param[in] *pSrcA points to the first input sequence. 00109 * @param[in] srcALen length of the first input sequence. 00110 * @param[in] *pSrcB points to the second input sequence. 00111 * @param[in] srcBLen length of the second input sequence. 00112 * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. 00113 * @return none. 00114 */ 00115 00116 void arm_correlate_f32( 00117 float32_t * pSrcA, 00118 uint32_t srcALen, 00119 float32_t * pSrcB, 00120 uint32_t srcBLen, 00121 float32_t * pDst) 00122 { 00123 00124 00125 #ifndef ARM_MATH_CM0_FAMILY 00126 00127 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00128 00129 float32_t *pIn1; /* inputA pointer */ 00130 float32_t *pIn2; /* inputB pointer */ 00131 float32_t *pOut = pDst; /* output pointer */ 00132 float32_t *px; /* Intermediate inputA pointer */ 00133 float32_t *py; /* Intermediate inputB pointer */ 00134 float32_t *pSrc1; /* Intermediate pointers */ 00135 float32_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ 00136 float32_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */ 00137 uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counters */ 00138 int32_t inc = 1; /* Destination address modifier */ 00139 00140 00141 /* The algorithm implementation is based on the lengths of the inputs. */ 00142 /* srcB is always made to slide across srcA. */ 00143 /* So srcBLen is always considered as shorter or equal to srcALen */ 00144 /* But CORR(x, y) is reverse of CORR(y, x) */ 00145 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00146 /* and the destination pointer modifier, inc is set to -1 */ 00147 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ 00148 /* But to improve the performance, 00149 * we include zeroes in the output instead of zero padding either of the the inputs*/ 00150 /* If srcALen > srcBLen, 00151 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ 00152 /* If srcALen < srcBLen, 00153 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ 00154 if(srcALen >= srcBLen) 00155 { 00156 /* Initialization of inputA pointer */ 00157 pIn1 = pSrcA; 00158 00159 /* Initialization of inputB pointer */ 00160 pIn2 = pSrcB; 00161 00162 /* Number of output samples is calculated */ 00163 outBlockSize = (2u * srcALen) - 1u; 00164 00165 /* When srcALen > srcBLen, zero padding has to be done to srcB 00166 * to make their lengths equal. 00167 * Instead, (outBlockSize - (srcALen + srcBLen - 1)) 00168 * number of output samples are made zero */ 00169 j = outBlockSize - (srcALen + (srcBLen - 1u)); 00170 00171 /* Updating the pointer position to non zero value */ 00172 pOut += j; 00173 00174 //while(j > 0u) 00175 //{ 00176 // /* Zero is stored in the destination buffer */ 00177 // *pOut++ = 0.0f; 00178 00179 // /* Decrement the loop counter */ 00180 // j--; 00181 //} 00182 00183 } 00184 else 00185 { 00186 /* Initialization of inputA pointer */ 00187 pIn1 = pSrcB; 00188 00189 /* Initialization of inputB pointer */ 00190 pIn2 = pSrcA; 00191 00192 /* srcBLen is always considered as shorter or equal to srcALen */ 00193 j = srcBLen; 00194 srcBLen = srcALen; 00195 srcALen = j; 00196 00197 /* CORR(x, y) = Reverse order(CORR(y, x)) */ 00198 /* Hence set the destination pointer to point to the last output sample */ 00199 pOut = pDst + ((srcALen + srcBLen) - 2u); 00200 00201 /* Destination address modifier is set to -1 */ 00202 inc = -1; 00203 00204 } 00205 00206 /* The function is internally 00207 * divided into three parts according to the number of multiplications that has to be 00208 * taken place between inputA samples and inputB samples. In the first part of the 00209 * algorithm, the multiplications increase by one for every iteration. 00210 * In the second part of the algorithm, srcBLen number of multiplications are done. 00211 * In the third part of the algorithm, the multiplications decrease by one 00212 * for every iteration.*/ 00213 /* The algorithm is implemented in three stages. 00214 * The loop counters of each stage is initiated here. */ 00215 blockSize1 = srcBLen - 1u; 00216 blockSize2 = srcALen - (srcBLen - 1u); 00217 blockSize3 = blockSize1; 00218 00219 /* -------------------------- 00220 * Initializations of stage1 00221 * -------------------------*/ 00222 00223 /* sum = x[0] * y[srcBlen - 1] 00224 * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1] 00225 * .... 00226 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] 00227 */ 00228 00229 /* In this stage the MAC operations are increased by 1 for every iteration. 00230 The count variable holds the number of MAC operations performed */ 00231 count = 1u; 00232 00233 /* Working pointer of inputA */ 00234 px = pIn1; 00235 00236 /* Working pointer of inputB */ 00237 pSrc1 = pIn2 + (srcBLen - 1u); 00238 py = pSrc1; 00239 00240 /* ------------------------ 00241 * Stage1 process 00242 * ----------------------*/ 00243 00244 /* The first stage starts here */ 00245 while(blockSize1 > 0u) 00246 { 00247 /* Accumulator is made zero for every iteration */ 00248 sum = 0.0f; 00249 00250 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00251 k = count >> 2u; 00252 00253 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00254 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00255 while(k > 0u) 00256 { 00257 /* x[0] * y[srcBLen - 4] */ 00258 sum += *px++ * *py++; 00259 /* x[1] * y[srcBLen - 3] */ 00260 sum += *px++ * *py++; 00261 /* x[2] * y[srcBLen - 2] */ 00262 sum += *px++ * *py++; 00263 /* x[3] * y[srcBLen - 1] */ 00264 sum += *px++ * *py++; 00265 00266 /* Decrement the loop counter */ 00267 k--; 00268 } 00269 00270 /* If the count is not a multiple of 4, compute any remaining MACs here. 00271 ** No loop unrolling is used. */ 00272 k = count % 0x4u; 00273 00274 while(k > 0u) 00275 { 00276 /* Perform the multiply-accumulate */ 00277 /* x[0] * y[srcBLen - 1] */ 00278 sum += *px++ * *py++; 00279 00280 /* Decrement the loop counter */ 00281 k--; 00282 } 00283 00284 /* Store the result in the accumulator in the destination buffer. */ 00285 *pOut = sum; 00286 /* Destination pointer is updated according to the address modifier, inc */ 00287 pOut += inc; 00288 00289 /* Update the inputA and inputB pointers for next MAC calculation */ 00290 py = pSrc1 - count; 00291 px = pIn1; 00292 00293 /* Increment the MAC count */ 00294 count++; 00295 00296 /* Decrement the loop counter */ 00297 blockSize1--; 00298 } 00299 00300 /* -------------------------- 00301 * Initializations of stage2 00302 * ------------------------*/ 00303 00304 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] 00305 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] 00306 * .... 00307 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00308 */ 00309 00310 /* Working pointer of inputA */ 00311 px = pIn1; 00312 00313 /* Working pointer of inputB */ 00314 py = pIn2; 00315 00316 /* count is index by which the pointer pIn1 to be incremented */ 00317 count = 0u; 00318 00319 /* ------------------- 00320 * Stage2 process 00321 * ------------------*/ 00322 00323 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00324 * So, to loop unroll over blockSize2, 00325 * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */ 00326 if(srcBLen >= 4u) 00327 { 00328 /* Loop unroll over blockSize2, by 4 */ 00329 blkCnt = blockSize2 >> 2u; 00330 00331 while(blkCnt > 0u) 00332 { 00333 /* Set all accumulators to zero */ 00334 acc0 = 0.0f; 00335 acc1 = 0.0f; 00336 acc2 = 0.0f; 00337 acc3 = 0.0f; 00338 00339 /* read x[0], x[1], x[2] samples */ 00340 x0 = *(px++); 00341 x1 = *(px++); 00342 x2 = *(px++); 00343 00344 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00345 k = srcBLen >> 2u; 00346 00347 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00348 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00349 do 00350 { 00351 /* Read y[0] sample */ 00352 c0 = *(py++); 00353 00354 /* Read x[3] sample */ 00355 x3 = *(px++); 00356 00357 /* Perform the multiply-accumulate */ 00358 /* acc0 += x[0] * y[0] */ 00359 acc0 += x0 * c0; 00360 /* acc1 += x[1] * y[0] */ 00361 acc1 += x1 * c0; 00362 /* acc2 += x[2] * y[0] */ 00363 acc2 += x2 * c0; 00364 /* acc3 += x[3] * y[0] */ 00365 acc3 += x3 * c0; 00366 00367 /* Read y[1] sample */ 00368 c0 = *(py++); 00369 00370 /* Read x[4] sample */ 00371 x0 = *(px++); 00372 00373 /* Perform the multiply-accumulate */ 00374 /* acc0 += x[1] * y[1] */ 00375 acc0 += x1 * c0; 00376 /* acc1 += x[2] * y[1] */ 00377 acc1 += x2 * c0; 00378 /* acc2 += x[3] * y[1] */ 00379 acc2 += x3 * c0; 00380 /* acc3 += x[4] * y[1] */ 00381 acc3 += x0 * c0; 00382 00383 /* Read y[2] sample */ 00384 c0 = *(py++); 00385 00386 /* Read x[5] sample */ 00387 x1 = *(px++); 00388 00389 /* Perform the multiply-accumulates */ 00390 /* acc0 += x[2] * y[2] */ 00391 acc0 += x2 * c0; 00392 /* acc1 += x[3] * y[2] */ 00393 acc1 += x3 * c0; 00394 /* acc2 += x[4] * y[2] */ 00395 acc2 += x0 * c0; 00396 /* acc3 += x[5] * y[2] */ 00397 acc3 += x1 * c0; 00398 00399 /* Read y[3] sample */ 00400 c0 = *(py++); 00401 00402 /* Read x[6] sample */ 00403 x2 = *(px++); 00404 00405 /* Perform the multiply-accumulates */ 00406 /* acc0 += x[3] * y[3] */ 00407 acc0 += x3 * c0; 00408 /* acc1 += x[4] * y[3] */ 00409 acc1 += x0 * c0; 00410 /* acc2 += x[5] * y[3] */ 00411 acc2 += x1 * c0; 00412 /* acc3 += x[6] * y[3] */ 00413 acc3 += x2 * c0; 00414 00415 00416 } while(--k); 00417 00418 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00419 ** No loop unrolling is used. */ 00420 k = srcBLen % 0x4u; 00421 00422 while(k > 0u) 00423 { 00424 /* Read y[4] sample */ 00425 c0 = *(py++); 00426 00427 /* Read x[7] sample */ 00428 x3 = *(px++); 00429 00430 /* Perform the multiply-accumulates */ 00431 /* acc0 += x[4] * y[4] */ 00432 acc0 += x0 * c0; 00433 /* acc1 += x[5] * y[4] */ 00434 acc1 += x1 * c0; 00435 /* acc2 += x[6] * y[4] */ 00436 acc2 += x2 * c0; 00437 /* acc3 += x[7] * y[4] */ 00438 acc3 += x3 * c0; 00439 00440 /* Reuse the present samples for the next MAC */ 00441 x0 = x1; 00442 x1 = x2; 00443 x2 = x3; 00444 00445 /* Decrement the loop counter */ 00446 k--; 00447 } 00448 00449 /* Store the result in the accumulator in the destination buffer. */ 00450 *pOut = acc0; 00451 /* Destination pointer is updated according to the address modifier, inc */ 00452 pOut += inc; 00453 00454 *pOut = acc1; 00455 pOut += inc; 00456 00457 *pOut = acc2; 00458 pOut += inc; 00459 00460 *pOut = acc3; 00461 pOut += inc; 00462 00463 /* Increment the pointer pIn1 index, count by 4 */ 00464 count += 4u; 00465 00466 /* Update the inputA and inputB pointers for next MAC calculation */ 00467 px = pIn1 + count; 00468 py = pIn2; 00469 00470 /* Decrement the loop counter */ 00471 blkCnt--; 00472 } 00473 00474 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00475 ** No loop unrolling is used. */ 00476 blkCnt = blockSize2 % 0x4u; 00477 00478 while(blkCnt > 0u) 00479 { 00480 /* Accumulator is made zero for every iteration */ 00481 sum = 0.0f; 00482 00483 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00484 k = srcBLen >> 2u; 00485 00486 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00487 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00488 while(k > 0u) 00489 { 00490 /* Perform the multiply-accumulates */ 00491 sum += *px++ * *py++; 00492 sum += *px++ * *py++; 00493 sum += *px++ * *py++; 00494 sum += *px++ * *py++; 00495 00496 /* Decrement the loop counter */ 00497 k--; 00498 } 00499 00500 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00501 ** No loop unrolling is used. */ 00502 k = srcBLen % 0x4u; 00503 00504 while(k > 0u) 00505 { 00506 /* Perform the multiply-accumulate */ 00507 sum += *px++ * *py++; 00508 00509 /* Decrement the loop counter */ 00510 k--; 00511 } 00512 00513 /* Store the result in the accumulator in the destination buffer. */ 00514 *pOut = sum; 00515 /* Destination pointer is updated according to the address modifier, inc */ 00516 pOut += inc; 00517 00518 /* Increment the pointer pIn1 index, count by 1 */ 00519 count++; 00520 00521 /* Update the inputA and inputB pointers for next MAC calculation */ 00522 px = pIn1 + count; 00523 py = pIn2; 00524 00525 /* Decrement the loop counter */ 00526 blkCnt--; 00527 } 00528 } 00529 else 00530 { 00531 /* If the srcBLen is not a multiple of 4, 00532 * the blockSize2 loop cannot be unrolled by 4 */ 00533 blkCnt = blockSize2; 00534 00535 while(blkCnt > 0u) 00536 { 00537 /* Accumulator is made zero for every iteration */ 00538 sum = 0.0f; 00539 00540 /* Loop over srcBLen */ 00541 k = srcBLen; 00542 00543 while(k > 0u) 00544 { 00545 /* Perform the multiply-accumulate */ 00546 sum += *px++ * *py++; 00547 00548 /* Decrement the loop counter */ 00549 k--; 00550 } 00551 00552 /* Store the result in the accumulator in the destination buffer. */ 00553 *pOut = sum; 00554 /* Destination pointer is updated according to the address modifier, inc */ 00555 pOut += inc; 00556 00557 /* Increment the pointer pIn1 index, count by 1 */ 00558 count++; 00559 00560 /* Update the inputA and inputB pointers for next MAC calculation */ 00561 px = pIn1 + count; 00562 py = pIn2; 00563 00564 /* Decrement the loop counter */ 00565 blkCnt--; 00566 } 00567 } 00568 00569 /* -------------------------- 00570 * Initializations of stage3 00571 * -------------------------*/ 00572 00573 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00574 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00575 * .... 00576 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] 00577 * sum += x[srcALen-1] * y[0] 00578 */ 00579 00580 /* In this stage the MAC operations are decreased by 1 for every iteration. 00581 The count variable holds the number of MAC operations performed */ 00582 count = srcBLen - 1u; 00583 00584 /* Working pointer of inputA */ 00585 pSrc1 = pIn1 + (srcALen - (srcBLen - 1u)); 00586 px = pSrc1; 00587 00588 /* Working pointer of inputB */ 00589 py = pIn2; 00590 00591 /* ------------------- 00592 * Stage3 process 00593 * ------------------*/ 00594 00595 while(blockSize3 > 0u) 00596 { 00597 /* Accumulator is made zero for every iteration */ 00598 sum = 0.0f; 00599 00600 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00601 k = count >> 2u; 00602 00603 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00604 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00605 while(k > 0u) 00606 { 00607 /* Perform the multiply-accumulates */ 00608 /* sum += x[srcALen - srcBLen + 4] * y[3] */ 00609 sum += *px++ * *py++; 00610 /* sum += x[srcALen - srcBLen + 3] * y[2] */ 00611 sum += *px++ * *py++; 00612 /* sum += x[srcALen - srcBLen + 2] * y[1] */ 00613 sum += *px++ * *py++; 00614 /* sum += x[srcALen - srcBLen + 1] * y[0] */ 00615 sum += *px++ * *py++; 00616 00617 /* Decrement the loop counter */ 00618 k--; 00619 } 00620 00621 /* If the count is not a multiple of 4, compute any remaining MACs here. 00622 ** No loop unrolling is used. */ 00623 k = count % 0x4u; 00624 00625 while(k > 0u) 00626 { 00627 /* Perform the multiply-accumulates */ 00628 sum += *px++ * *py++; 00629 00630 /* Decrement the loop counter */ 00631 k--; 00632 } 00633 00634 /* Store the result in the accumulator in the destination buffer. */ 00635 *pOut = sum; 00636 /* Destination pointer is updated according to the address modifier, inc */ 00637 pOut += inc; 00638 00639 /* Update the inputA and inputB pointers for next MAC calculation */ 00640 px = ++pSrc1; 00641 py = pIn2; 00642 00643 /* Decrement the MAC count */ 00644 count--; 00645 00646 /* Decrement the loop counter */ 00647 blockSize3--; 00648 } 00649 00650 #else 00651 00652 /* Run the below code for Cortex-M0 */ 00653 00654 float32_t *pIn1 = pSrcA; /* inputA pointer */ 00655 float32_t *pIn2 = pSrcB + (srcBLen - 1u); /* inputB pointer */ 00656 float32_t sum; /* Accumulator */ 00657 uint32_t i = 0u, j; /* loop counters */ 00658 uint32_t inv = 0u; /* Reverse order flag */ 00659 uint32_t tot = 0u; /* Length */ 00660 00661 /* The algorithm implementation is based on the lengths of the inputs. */ 00662 /* srcB is always made to slide across srcA. */ 00663 /* So srcBLen is always considered as shorter or equal to srcALen */ 00664 /* But CORR(x, y) is reverse of CORR(y, x) */ 00665 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00666 /* and a varaible, inv is set to 1 */ 00667 /* If lengths are not equal then zero pad has to be done to make the two 00668 * inputs of same length. But to improve the performance, we include zeroes 00669 * in the output instead of zero padding either of the the inputs*/ 00670 /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the 00671 * starting of the output buffer */ 00672 /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the 00673 * ending of the output buffer */ 00674 /* Once the zero padding is done the remaining of the output is calcualted 00675 * using convolution but with the shorter signal time shifted. */ 00676 00677 /* Calculate the length of the remaining sequence */ 00678 tot = ((srcALen + srcBLen) - 2u); 00679 00680 if(srcALen > srcBLen) 00681 { 00682 /* Calculating the number of zeros to be padded to the output */ 00683 j = srcALen - srcBLen; 00684 00685 /* Initialise the pointer after zero padding */ 00686 pDst += j; 00687 } 00688 00689 else if(srcALen < srcBLen) 00690 { 00691 /* Initialization to inputB pointer */ 00692 pIn1 = pSrcB; 00693 00694 /* Initialization to the end of inputA pointer */ 00695 pIn2 = pSrcA + (srcALen - 1u); 00696 00697 /* Initialisation of the pointer after zero padding */ 00698 pDst = pDst + tot; 00699 00700 /* Swapping the lengths */ 00701 j = srcALen; 00702 srcALen = srcBLen; 00703 srcBLen = j; 00704 00705 /* Setting the reverse flag */ 00706 inv = 1; 00707 00708 } 00709 00710 /* Loop to calculate convolution for output length number of times */ 00711 for (i = 0u; i <= tot; i++) 00712 { 00713 /* Initialize sum with zero to carry on MAC operations */ 00714 sum = 0.0f; 00715 00716 /* Loop to perform MAC operations according to convolution equation */ 00717 for (j = 0u; j <= i; j++) 00718 { 00719 /* Check the array limitations */ 00720 if((((i - j) < srcBLen) && (j < srcALen))) 00721 { 00722 /* z[i] += x[i-j] * y[j] */ 00723 sum += pIn1[j] * pIn2[-((int32_t) i - j)]; 00724 } 00725 } 00726 /* Store the output in the destination buffer */ 00727 if(inv == 1) 00728 *pDst-- = sum; 00729 else 00730 *pDst++ = sum; 00731 } 00732 00733 #endif /* #ifndef ARM_MATH_CM0_FAMILY */ 00734 00735 } 00736 00737 /** 00738 * @} end of Corr group 00739 */
Generated on Tue Jul 12 2022 12:36:54 by 1.7.2