Added CMSIS5 DSP and NN folder. Needs some work

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_correlate_q15.c Source File

arm_correlate_q15.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_correlate_q15.c
00004  * Description:  Correlation of Q15 sequences
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupFilters
00033  */
00034 
00035 /**
00036  * @addtogroup Corr
00037  * @{
00038  */
00039 
00040 /**
00041  * @brief Correlation of Q15 sequences.
00042  * @param[in] *pSrcA points to the first input sequence.
00043  * @param[in] srcALen length of the first input sequence.
00044  * @param[in] *pSrcB points to the second input sequence.
00045  * @param[in] srcBLen length of the second input sequence.
00046  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
00047  * @return none.
00048  *
00049  * @details
00050  * <b>Scaling and Overflow Behavior:</b>
00051  *
00052  * \par
00053  * The function is implemented using a 64-bit internal accumulator.
00054  * Both inputs are in 1.15 format and multiplications yield a 2.30 result.
00055  * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
00056  * This approach provides 33 guard bits and there is no risk of overflow.
00057  * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
00058  *
00059  * \par
00060  * Refer to <code>arm_correlate_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
00061  *
00062  * \par
00063  * Refer the function <code>arm_correlate_opt_q15()</code> for a faster implementation of this function using scratch buffers.
00064  *
00065  */
00066 
00067 void arm_correlate_q15(
00068   q15_t * pSrcA,
00069   uint32_t srcALen,
00070   q15_t * pSrcB,
00071   uint32_t srcBLen,
00072   q15_t * pDst)
00073 {
00074 
00075 #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
00076 
00077   /* Run the below code for Cortex-M4 and Cortex-M3 */
00078 
00079   q15_t *pIn1;                                   /* inputA pointer               */
00080   q15_t *pIn2;                                   /* inputB pointer               */
00081   q15_t *pOut = pDst;                            /* output pointer               */
00082   q63_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */
00083   q15_t *px;                                     /* Intermediate inputA pointer  */
00084   q15_t *py;                                     /* Intermediate inputB pointer  */
00085   q15_t *pSrc1;                                  /* Intermediate pointers        */
00086   q31_t x0, x1, x2, x3, c0;                      /* temporary variables for holding input and coefficient values */
00087   uint32_t j, k = 0U, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */
00088   int32_t inc = 1;                               /* Destination address modifier */
00089 
00090 
00091   /* The algorithm implementation is based on the lengths of the inputs. */
00092   /* srcB is always made to slide across srcA. */
00093   /* So srcBLen is always considered as shorter or equal to srcALen */
00094   /* But CORR(x, y) is reverse of CORR(y, x) */
00095   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
00096   /* and the destination pointer modifier, inc is set to -1 */
00097   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
00098   /* But to improve the performance,
00099    * we include zeroes in the output instead of zero padding either of the the inputs*/
00100   /* If srcALen > srcBLen,
00101    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
00102   /* If srcALen < srcBLen,
00103    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
00104   if (srcALen >= srcBLen)
00105   {
00106     /* Initialization of inputA pointer */
00107     pIn1 = (pSrcA);
00108 
00109     /* Initialization of inputB pointer */
00110     pIn2 = (pSrcB);
00111 
00112     /* Number of output samples is calculated */
00113     outBlockSize = (2U * srcALen) - 1U;
00114 
00115     /* When srcALen > srcBLen, zero padding is done to srcB
00116      * to make their lengths equal.
00117      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
00118      * number of output samples are made zero */
00119     j = outBlockSize - (srcALen + (srcBLen - 1U));
00120 
00121     /* Updating the pointer position to non zero value */
00122     pOut += j;
00123 
00124   }
00125   else
00126   {
00127     /* Initialization of inputA pointer */
00128     pIn1 = (pSrcB);
00129 
00130     /* Initialization of inputB pointer */
00131     pIn2 = (pSrcA);
00132 
00133     /* srcBLen is always considered as shorter or equal to srcALen */
00134     j = srcBLen;
00135     srcBLen = srcALen;
00136     srcALen = j;
00137 
00138     /* CORR(x, y) = Reverse order(CORR(y, x)) */
00139     /* Hence set the destination pointer to point to the last output sample */
00140     pOut = pDst + ((srcALen + srcBLen) - 2U);
00141 
00142     /* Destination address modifier is set to -1 */
00143     inc = -1;
00144 
00145   }
00146 
00147   /* The function is internally
00148    * divided into three parts according to the number of multiplications that has to be
00149    * taken place between inputA samples and inputB samples. In the first part of the
00150    * algorithm, the multiplications increase by one for every iteration.
00151    * In the second part of the algorithm, srcBLen number of multiplications are done.
00152    * In the third part of the algorithm, the multiplications decrease by one
00153    * for every iteration.*/
00154   /* The algorithm is implemented in three stages.
00155    * The loop counters of each stage is initiated here. */
00156   blockSize1 = srcBLen - 1U;
00157   blockSize2 = srcALen - (srcBLen - 1U);
00158   blockSize3 = blockSize1;
00159 
00160   /* --------------------------
00161    * Initializations of stage1
00162    * -------------------------*/
00163 
00164   /* sum = x[0] * y[srcBlen - 1]
00165    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
00166    * ....
00167    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
00168    */
00169 
00170   /* In this stage the MAC operations are increased by 1 for every iteration.
00171      The count variable holds the number of MAC operations performed */
00172   count = 1U;
00173 
00174   /* Working pointer of inputA */
00175   px = pIn1;
00176 
00177   /* Working pointer of inputB */
00178   pSrc1 = pIn2 + (srcBLen - 1U);
00179   py = pSrc1;
00180 
00181   /* ------------------------
00182    * Stage1 process
00183    * ----------------------*/
00184 
00185   /* The first loop starts here */
00186   while (blockSize1 > 0U)
00187   {
00188     /* Accumulator is made zero for every iteration */
00189     sum = 0;
00190 
00191     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00192     k = count >> 2;
00193 
00194     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00195      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00196     while (k > 0U)
00197     {
00198       /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
00199       sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
00200       /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
00201       sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
00202 
00203       /* Decrement the loop counter */
00204       k--;
00205     }
00206 
00207     /* If the count is not a multiple of 4, compute any remaining MACs here.
00208      ** No loop unrolling is used. */
00209     k = count % 0x4U;
00210 
00211     while (k > 0U)
00212     {
00213       /* Perform the multiply-accumulates */
00214       /* x[0] * y[srcBLen - 1] */
00215       sum = __SMLALD(*px++, *py++, sum);
00216 
00217       /* Decrement the loop counter */
00218       k--;
00219     }
00220 
00221     /* Store the result in the accumulator in the destination buffer. */
00222     *pOut = (q15_t) (__SSAT((sum >> 15), 16));
00223     /* Destination pointer is updated according to the address modifier, inc */
00224     pOut += inc;
00225 
00226     /* Update the inputA and inputB pointers for next MAC calculation */
00227     py = pSrc1 - count;
00228     px = pIn1;
00229 
00230     /* Increment the MAC count */
00231     count++;
00232 
00233     /* Decrement the loop counter */
00234     blockSize1--;
00235   }
00236 
00237   /* --------------------------
00238    * Initializations of stage2
00239    * ------------------------*/
00240 
00241   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
00242    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
00243    * ....
00244    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
00245    */
00246 
00247   /* Working pointer of inputA */
00248   px = pIn1;
00249 
00250   /* Working pointer of inputB */
00251   py = pIn2;
00252 
00253   /* count is index by which the pointer pIn1 to be incremented */
00254   count = 0U;
00255 
00256   /* -------------------
00257    * Stage2 process
00258    * ------------------*/
00259 
00260   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
00261    * So, to loop unroll over blockSize2,
00262    * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
00263   if (srcBLen >= 4U)
00264   {
00265     /* Loop unroll over blockSize2, by 4 */
00266     blkCnt = blockSize2 >> 2U;
00267 
00268     while (blkCnt > 0U)
00269     {
00270       /* Set all accumulators to zero */
00271       acc0 = 0;
00272       acc1 = 0;
00273       acc2 = 0;
00274       acc3 = 0;
00275 
00276       /* read x[0], x[1] samples */
00277       x0 = *__SIMD32(px);
00278       /* read x[1], x[2] samples */
00279       x1 = _SIMD32_OFFSET(px + 1);
00280       px += 2U;
00281 
00282       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00283       k = srcBLen >> 2U;
00284 
00285       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00286        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00287       do
00288       {
00289         /* Read the first two inputB samples using SIMD:
00290          * y[0] and y[1] */
00291         c0 = *__SIMD32(py)++;
00292 
00293         /* acc0 +=  x[0] * y[0] + x[1] * y[1] */
00294         acc0 = __SMLALD(x0, c0, acc0);
00295 
00296         /* acc1 +=  x[1] * y[0] + x[2] * y[1] */
00297         acc1 = __SMLALD(x1, c0, acc1);
00298 
00299         /* Read x[2], x[3] */
00300         x2 = *__SIMD32(px);
00301 
00302         /* Read x[3], x[4] */
00303         x3 = _SIMD32_OFFSET(px + 1);
00304 
00305         /* acc2 +=  x[2] * y[0] + x[3] * y[1] */
00306         acc2 = __SMLALD(x2, c0, acc2);
00307 
00308         /* acc3 +=  x[3] * y[0] + x[4] * y[1] */
00309         acc3 = __SMLALD(x3, c0, acc3);
00310 
00311         /* Read y[2] and y[3] */
00312         c0 = *__SIMD32(py)++;
00313 
00314         /* acc0 +=  x[2] * y[2] + x[3] * y[3] */
00315         acc0 = __SMLALD(x2, c0, acc0);
00316 
00317         /* acc1 +=  x[3] * y[2] + x[4] * y[3] */
00318         acc1 = __SMLALD(x3, c0, acc1);
00319 
00320         /* Read x[4], x[5] */
00321         x0 = _SIMD32_OFFSET(px + 2);
00322 
00323         /* Read x[5], x[6] */
00324         x1 = _SIMD32_OFFSET(px + 3);
00325 
00326         px += 4U;
00327 
00328         /* acc2 +=  x[4] * y[2] + x[5] * y[3] */
00329         acc2 = __SMLALD(x0, c0, acc2);
00330 
00331         /* acc3 +=  x[5] * y[2] + x[6] * y[3] */
00332         acc3 = __SMLALD(x1, c0, acc3);
00333 
00334       } while (--k);
00335 
00336       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00337        ** No loop unrolling is used. */
00338       k = srcBLen % 0x4U;
00339 
00340       if (k == 1U)
00341       {
00342         /* Read y[4] */
00343         c0 = *py;
00344 #ifdef  ARM_MATH_BIG_ENDIAN
00345 
00346         c0 = c0 << 16U;
00347 
00348 #else
00349 
00350         c0 = c0 & 0x0000FFFF;
00351 
00352 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00353         /* Read x[7] */
00354         x3 = *__SIMD32(px);
00355         px++;
00356 
00357         /* Perform the multiply-accumulates */
00358         acc0 = __SMLALD(x0, c0, acc0);
00359         acc1 = __SMLALD(x1, c0, acc1);
00360         acc2 = __SMLALDX(x1, c0, acc2);
00361         acc3 = __SMLALDX(x3, c0, acc3);
00362       }
00363 
00364       if (k == 2U)
00365       {
00366         /* Read y[4], y[5] */
00367         c0 = *__SIMD32(py);
00368 
00369         /* Read x[7], x[8] */
00370         x3 = *__SIMD32(px);
00371 
00372         /* Read x[9] */
00373         x2 = _SIMD32_OFFSET(px + 1);
00374         px += 2U;
00375 
00376         /* Perform the multiply-accumulates */
00377         acc0 = __SMLALD(x0, c0, acc0);
00378         acc1 = __SMLALD(x1, c0, acc1);
00379         acc2 = __SMLALD(x3, c0, acc2);
00380         acc3 = __SMLALD(x2, c0, acc3);
00381       }
00382 
00383       if (k == 3U)
00384       {
00385         /* Read y[4], y[5] */
00386         c0 = *__SIMD32(py)++;
00387 
00388         /* Read x[7], x[8] */
00389         x3 = *__SIMD32(px);
00390 
00391         /* Read x[9] */
00392         x2 = _SIMD32_OFFSET(px + 1);
00393 
00394         /* Perform the multiply-accumulates */
00395         acc0 = __SMLALD(x0, c0, acc0);
00396         acc1 = __SMLALD(x1, c0, acc1);
00397         acc2 = __SMLALD(x3, c0, acc2);
00398         acc3 = __SMLALD(x2, c0, acc3);
00399 
00400         c0 = (*py);
00401 
00402         /* Read y[6] */
00403 #ifdef  ARM_MATH_BIG_ENDIAN
00404 
00405         c0 = c0 << 16U;
00406 #else
00407 
00408         c0 = c0 & 0x0000FFFF;
00409 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00410         /* Read x[10] */
00411         x3 = _SIMD32_OFFSET(px + 2);
00412         px += 3U;
00413 
00414         /* Perform the multiply-accumulates */
00415         acc0 = __SMLALDX(x1, c0, acc0);
00416         acc1 = __SMLALD(x2, c0, acc1);
00417         acc2 = __SMLALDX(x2, c0, acc2);
00418         acc3 = __SMLALDX(x3, c0, acc3);
00419       }
00420 
00421       /* Store the result in the accumulator in the destination buffer. */
00422       *pOut = (q15_t) (__SSAT(acc0 >> 15, 16));
00423       /* Destination pointer is updated according to the address modifier, inc */
00424       pOut += inc;
00425 
00426       *pOut = (q15_t) (__SSAT(acc1 >> 15, 16));
00427       pOut += inc;
00428 
00429       *pOut = (q15_t) (__SSAT(acc2 >> 15, 16));
00430       pOut += inc;
00431 
00432       *pOut = (q15_t) (__SSAT(acc3 >> 15, 16));
00433       pOut += inc;
00434 
00435       /* Increment the count by 4 as 4 output values are computed */
00436       count += 4U;
00437 
00438       /* Update the inputA and inputB pointers for next MAC calculation */
00439       px = pIn1 + count;
00440       py = pIn2;
00441 
00442       /* Decrement the loop counter */
00443       blkCnt--;
00444     }
00445 
00446     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
00447      ** No loop unrolling is used. */
00448     blkCnt = blockSize2 % 0x4U;
00449 
00450     while (blkCnt > 0U)
00451     {
00452       /* Accumulator is made zero for every iteration */
00453       sum = 0;
00454 
00455       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00456       k = srcBLen >> 2U;
00457 
00458       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00459        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00460       while (k > 0U)
00461       {
00462         /* Perform the multiply-accumulates */
00463         sum += ((q63_t) * px++ * *py++);
00464         sum += ((q63_t) * px++ * *py++);
00465         sum += ((q63_t) * px++ * *py++);
00466         sum += ((q63_t) * px++ * *py++);
00467 
00468         /* Decrement the loop counter */
00469         k--;
00470       }
00471 
00472       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00473        ** No loop unrolling is used. */
00474       k = srcBLen % 0x4U;
00475 
00476       while (k > 0U)
00477       {
00478         /* Perform the multiply-accumulates */
00479         sum += ((q63_t) * px++ * *py++);
00480 
00481         /* Decrement the loop counter */
00482         k--;
00483       }
00484 
00485       /* Store the result in the accumulator in the destination buffer. */
00486       *pOut = (q15_t) (__SSAT(sum >> 15, 16));
00487       /* Destination pointer is updated according to the address modifier, inc */
00488       pOut += inc;
00489 
00490       /* Increment count by 1, as one output value is computed */
00491       count++;
00492 
00493       /* Update the inputA and inputB pointers for next MAC calculation */
00494       px = pIn1 + count;
00495       py = pIn2;
00496 
00497       /* Decrement the loop counter */
00498       blkCnt--;
00499     }
00500   }
00501   else
00502   {
00503     /* If the srcBLen is not a multiple of 4,
00504      * the blockSize2 loop cannot be unrolled by 4 */
00505     blkCnt = blockSize2;
00506 
00507     while (blkCnt > 0U)
00508     {
00509       /* Accumulator is made zero for every iteration */
00510       sum = 0;
00511 
00512       /* Loop over srcBLen */
00513       k = srcBLen;
00514 
00515       while (k > 0U)
00516       {
00517         /* Perform the multiply-accumulate */
00518         sum += ((q63_t) * px++ * *py++);
00519 
00520         /* Decrement the loop counter */
00521         k--;
00522       }
00523 
00524       /* Store the result in the accumulator in the destination buffer. */
00525       *pOut = (q15_t) (__SSAT(sum >> 15, 16));
00526       /* Destination pointer is updated according to the address modifier, inc */
00527       pOut += inc;
00528 
00529       /* Increment the MAC count */
00530       count++;
00531 
00532       /* Update the inputA and inputB pointers for next MAC calculation */
00533       px = pIn1 + count;
00534       py = pIn2;
00535 
00536       /* Decrement the loop counter */
00537       blkCnt--;
00538     }
00539   }
00540 
00541   /* --------------------------
00542    * Initializations of stage3
00543    * -------------------------*/
00544 
00545   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
00546    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
00547    * ....
00548    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
00549    * sum +=  x[srcALen-1] * y[0]
00550    */
00551 
00552   /* In this stage the MAC operations are decreased by 1 for every iteration.
00553      The count variable holds the number of MAC operations performed */
00554   count = srcBLen - 1U;
00555 
00556   /* Working pointer of inputA */
00557   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
00558   px = pSrc1;
00559 
00560   /* Working pointer of inputB */
00561   py = pIn2;
00562 
00563   /* -------------------
00564    * Stage3 process
00565    * ------------------*/
00566 
00567   while (blockSize3 > 0U)
00568   {
00569     /* Accumulator is made zero for every iteration */
00570     sum = 0;
00571 
00572     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00573     k = count >> 2U;
00574 
00575     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00576      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00577     while (k > 0U)
00578     {
00579       /* Perform the multiply-accumulates */
00580       /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
00581       sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
00582       /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
00583       sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
00584 
00585       /* Decrement the loop counter */
00586       k--;
00587     }
00588 
00589     /* If the count is not a multiple of 4, compute any remaining MACs here.
00590      ** No loop unrolling is used. */
00591     k = count % 0x4U;
00592 
00593     while (k > 0U)
00594     {
00595       /* Perform the multiply-accumulates */
00596       sum = __SMLALD(*px++, *py++, sum);
00597 
00598       /* Decrement the loop counter */
00599       k--;
00600     }
00601 
00602     /* Store the result in the accumulator in the destination buffer. */
00603     *pOut = (q15_t) (__SSAT((sum >> 15), 16));
00604     /* Destination pointer is updated according to the address modifier, inc */
00605     pOut += inc;
00606 
00607     /* Update the inputA and inputB pointers for next MAC calculation */
00608     px = ++pSrc1;
00609     py = pIn2;
00610 
00611     /* Decrement the MAC count */
00612     count--;
00613 
00614     /* Decrement the loop counter */
00615     blockSize3--;
00616   }
00617 
00618 #else
00619 
00620 /* Run the below code for Cortex-M0 */
00621 
00622   q15_t *pIn1 = pSrcA;                           /* inputA pointer               */
00623   q15_t *pIn2 = pSrcB + (srcBLen - 1U);          /* inputB pointer               */
00624   q63_t sum;                                     /* Accumulators                  */
00625   uint32_t i = 0U, j;                            /* loop counters */
00626   uint32_t inv = 0U;                             /* Reverse order flag */
00627   uint32_t tot = 0U;                             /* Length */
00628 
00629   /* The algorithm implementation is based on the lengths of the inputs. */
00630   /* srcB is always made to slide across srcA. */
00631   /* So srcBLen is always considered as shorter or equal to srcALen */
00632   /* But CORR(x, y) is reverse of CORR(y, x) */
00633   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
00634   /* and a varaible, inv is set to 1 */
00635   /* If lengths are not equal then zero pad has to be done to  make the two
00636    * inputs of same length. But to improve the performance, we include zeroes
00637    * in the output instead of zero padding either of the the inputs*/
00638   /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
00639    * starting of the output buffer */
00640   /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
00641    * ending of the output buffer */
00642   /* Once the zero padding is done the remaining of the output is calcualted
00643    * using convolution but with the shorter signal time shifted. */
00644 
00645   /* Calculate the length of the remaining sequence */
00646   tot = ((srcALen + srcBLen) - 2U);
00647 
00648   if (srcALen > srcBLen)
00649   {
00650     /* Calculating the number of zeros to be padded to the output */
00651     j = srcALen - srcBLen;
00652 
00653     /* Initialise the pointer after zero padding */
00654     pDst += j;
00655   }
00656 
00657   else if (srcALen < srcBLen)
00658   {
00659     /* Initialization to inputB pointer */
00660     pIn1 = pSrcB;
00661 
00662     /* Initialization to the end of inputA pointer */
00663     pIn2 = pSrcA + (srcALen - 1U);
00664 
00665     /* Initialisation of the pointer after zero padding */
00666     pDst = pDst + tot;
00667 
00668     /* Swapping the lengths */
00669     j = srcALen;
00670     srcALen = srcBLen;
00671     srcBLen = j;
00672 
00673     /* Setting the reverse flag */
00674     inv = 1;
00675 
00676   }
00677 
00678   /* Loop to calculate convolution for output length number of times */
00679   for (i = 0U; i <= tot; i++)
00680   {
00681     /* Initialize sum with zero to carry on MAC operations */
00682     sum = 0;
00683 
00684     /* Loop to perform MAC operations according to convolution equation */
00685     for (j = 0U; j <= i; j++)
00686     {
00687       /* Check the array limitations */
00688       if ((((i - j) < srcBLen) && (j < srcALen)))
00689       {
00690         /* z[i] += x[i-j] * y[j] */
00691         sum += ((q31_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
00692       }
00693     }
00694     /* Store the output in the destination buffer */
00695     if (inv == 1)
00696       *pDst-- = (q15_t) __SSAT((sum >> 15U), 16U);
00697     else
00698       *pDst++ = (q15_t) __SSAT((sum >> 15U), 16U);
00699   }
00700 
00701 #endif /* #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */
00702 
00703 }
00704 
00705 /**
00706  * @} end of Corr group
00707  */
00708