Added CMSIS5 DSP and NN folder. Needs some work

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_correlate_q7.c Source File

arm_correlate_q7.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_correlate_q7.c
00004  * Description:  Correlation of Q7 sequences
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupFilters
00033  */
00034 
00035 /**
00036  * @addtogroup Corr
00037  * @{
00038  */
00039 
00040 /**
00041  * @brief Correlation of Q7 sequences.
00042  * @param[in] *pSrcA points to the first input sequence.
00043  * @param[in] srcALen length of the first input sequence.
00044  * @param[in] *pSrcB points to the second input sequence.
00045  * @param[in] srcBLen length of the second input sequence.
00046  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
00047  * @return none.
00048  *
00049  * @details
00050  * <b>Scaling and Overflow Behavior:</b>
00051  *
00052  * \par
00053  * The function is implemented using a 32-bit internal accumulator.
00054  * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
00055  * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
00056  * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
00057  * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format.
00058  *
00059  * \par
00060  * Refer the function <code>arm_correlate_opt_q7()</code> for a faster implementation of this function.
00061  *
00062  */
00063 
00064 void arm_correlate_q7(
00065   q7_t * pSrcA,
00066   uint32_t srcALen,
00067   q7_t * pSrcB,
00068   uint32_t srcBLen,
00069   q7_t * pDst)
00070 {
00071 
00072 
00073 #if defined (ARM_MATH_DSP)
00074 
00075   /* Run the below code for Cortex-M4 and Cortex-M3 */
00076 
00077   q7_t *pIn1;                                    /* inputA pointer               */
00078   q7_t *pIn2;                                    /* inputB pointer               */
00079   q7_t *pOut = pDst;                             /* output pointer               */
00080   q7_t *px;                                      /* Intermediate inputA pointer  */
00081   q7_t *py;                                      /* Intermediate inputB pointer  */
00082   q7_t *pSrc1;                                   /* Intermediate pointers        */
00083   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */
00084   q31_t input1, input2;                          /* temporary variables */
00085   q15_t in1, in2;                                /* temporary variables */
00086   q7_t x0, x1, x2, x3, c0, c1;                   /* temporary variables for holding input and coefficient values */
00087   uint32_t j, k = 0U, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */
00088   int32_t inc = 1;
00089 
00090 
00091   /* The algorithm implementation is based on the lengths of the inputs. */
00092   /* srcB is always made to slide across srcA. */
00093   /* So srcBLen is always considered as shorter or equal to srcALen */
00094   /* But CORR(x, y) is reverse of CORR(y, x) */
00095   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
00096   /* and the destination pointer modifier, inc is set to -1 */
00097   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
00098   /* But to improve the performance,
00099    * we include zeroes in the output instead of zero padding either of the the inputs*/
00100   /* If srcALen > srcBLen,
00101    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
00102   /* If srcALen < srcBLen,
00103    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
00104   if (srcALen >= srcBLen)
00105   {
00106     /* Initialization of inputA pointer */
00107     pIn1 = (pSrcA);
00108 
00109     /* Initialization of inputB pointer */
00110     pIn2 = (pSrcB);
00111 
00112     /* Number of output samples is calculated */
00113     outBlockSize = (2U * srcALen) - 1U;
00114 
00115     /* When srcALen > srcBLen, zero padding is done to srcB
00116      * to make their lengths equal.
00117      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
00118      * number of output samples are made zero */
00119     j = outBlockSize - (srcALen + (srcBLen - 1U));
00120 
00121     /* Updating the pointer position to non zero value */
00122     pOut += j;
00123 
00124   }
00125   else
00126   {
00127     /* Initialization of inputA pointer */
00128     pIn1 = (pSrcB);
00129 
00130     /* Initialization of inputB pointer */
00131     pIn2 = (pSrcA);
00132 
00133     /* srcBLen is always considered as shorter or equal to srcALen */
00134     j = srcBLen;
00135     srcBLen = srcALen;
00136     srcALen = j;
00137 
00138     /* CORR(x, y) = Reverse order(CORR(y, x)) */
00139     /* Hence set the destination pointer to point to the last output sample */
00140     pOut = pDst + ((srcALen + srcBLen) - 2U);
00141 
00142     /* Destination address modifier is set to -1 */
00143     inc = -1;
00144 
00145   }
00146 
00147   /* The function is internally
00148    * divided into three parts according to the number of multiplications that has to be
00149    * taken place between inputA samples and inputB samples. In the first part of the
00150    * algorithm, the multiplications increase by one for every iteration.
00151    * In the second part of the algorithm, srcBLen number of multiplications are done.
00152    * In the third part of the algorithm, the multiplications decrease by one
00153    * for every iteration.*/
00154   /* The algorithm is implemented in three stages.
00155    * The loop counters of each stage is initiated here. */
00156   blockSize1 = srcBLen - 1U;
00157   blockSize2 = srcALen - (srcBLen - 1U);
00158   blockSize3 = blockSize1;
00159 
00160   /* --------------------------
00161    * Initializations of stage1
00162    * -------------------------*/
00163 
00164   /* sum = x[0] * y[srcBlen - 1]
00165    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
00166    * ....
00167    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
00168    */
00169 
00170   /* In this stage the MAC operations are increased by 1 for every iteration.
00171      The count variable holds the number of MAC operations performed */
00172   count = 1U;
00173 
00174   /* Working pointer of inputA */
00175   px = pIn1;
00176 
00177   /* Working pointer of inputB */
00178   pSrc1 = pIn2 + (srcBLen - 1U);
00179   py = pSrc1;
00180 
00181   /* ------------------------
00182    * Stage1 process
00183    * ----------------------*/
00184 
00185   /* The first stage starts here */
00186   while (blockSize1 > 0U)
00187   {
00188     /* Accumulator is made zero for every iteration */
00189     sum = 0;
00190 
00191     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00192     k = count >> 2;
00193 
00194     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00195      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00196     while (k > 0U)
00197     {
00198       /* x[0] , x[1] */
00199       in1 = (q15_t) * px++;
00200       in2 = (q15_t) * px++;
00201       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00202 
00203       /* y[srcBLen - 4] , y[srcBLen - 3] */
00204       in1 = (q15_t) * py++;
00205       in2 = (q15_t) * py++;
00206       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00207 
00208       /* x[0] * y[srcBLen - 4] */
00209       /* x[1] * y[srcBLen - 3] */
00210       sum = __SMLAD(input1, input2, sum);
00211 
00212       /* x[2] , x[3] */
00213       in1 = (q15_t) * px++;
00214       in2 = (q15_t) * px++;
00215       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00216 
00217       /* y[srcBLen - 2] , y[srcBLen - 1] */
00218       in1 = (q15_t) * py++;
00219       in2 = (q15_t) * py++;
00220       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00221 
00222       /* x[2] * y[srcBLen - 2] */
00223       /* x[3] * y[srcBLen - 1] */
00224       sum = __SMLAD(input1, input2, sum);
00225 
00226 
00227       /* Decrement the loop counter */
00228       k--;
00229     }
00230 
00231     /* If the count is not a multiple of 4, compute any remaining MACs here.
00232      ** No loop unrolling is used. */
00233     k = count % 0x4U;
00234 
00235     while (k > 0U)
00236     {
00237       /* Perform the multiply-accumulates */
00238       /* x[0] * y[srcBLen - 1] */
00239       sum += (q31_t) ((q15_t) * px++ * *py++);
00240 
00241       /* Decrement the loop counter */
00242       k--;
00243     }
00244 
00245     /* Store the result in the accumulator in the destination buffer. */
00246     *pOut = (q7_t) (__SSAT(sum >> 7, 8));
00247     /* Destination pointer is updated according to the address modifier, inc */
00248     pOut += inc;
00249 
00250     /* Update the inputA and inputB pointers for next MAC calculation */
00251     py = pSrc1 - count;
00252     px = pIn1;
00253 
00254     /* Increment the MAC count */
00255     count++;
00256 
00257     /* Decrement the loop counter */
00258     blockSize1--;
00259   }
00260 
00261   /* --------------------------
00262    * Initializations of stage2
00263    * ------------------------*/
00264 
00265   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
00266    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
00267    * ....
00268    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
00269    */
00270 
00271   /* Working pointer of inputA */
00272   px = pIn1;
00273 
00274   /* Working pointer of inputB */
00275   py = pIn2;
00276 
00277   /* count is index by which the pointer pIn1 to be incremented */
00278   count = 0U;
00279 
00280   /* -------------------
00281    * Stage2 process
00282    * ------------------*/
00283 
00284   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
00285    * So, to loop unroll over blockSize2,
00286    * srcBLen should be greater than or equal to 4 */
00287   if (srcBLen >= 4U)
00288   {
00289     /* Loop unroll over blockSize2, by 4 */
00290     blkCnt = blockSize2 >> 2U;
00291 
00292     while (blkCnt > 0U)
00293     {
00294       /* Set all accumulators to zero */
00295       acc0 = 0;
00296       acc1 = 0;
00297       acc2 = 0;
00298       acc3 = 0;
00299 
00300       /* read x[0], x[1], x[2] samples */
00301       x0 = *px++;
00302       x1 = *px++;
00303       x2 = *px++;
00304 
00305       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00306       k = srcBLen >> 2U;
00307 
00308       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00309        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00310       do
00311       {
00312         /* Read y[0] sample */
00313         c0 = *py++;
00314         /* Read y[1] sample */
00315         c1 = *py++;
00316 
00317         /* Read x[3] sample */
00318         x3 = *px++;
00319 
00320         /* x[0] and x[1] are packed */
00321         in1 = (q15_t) x0;
00322         in2 = (q15_t) x1;
00323 
00324         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00325 
00326         /* y[0] and y[1] are packed */
00327         in1 = (q15_t) c0;
00328         in2 = (q15_t) c1;
00329 
00330         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00331 
00332         /* acc0 += x[0] * y[0] + x[1] * y[1]  */
00333         acc0 = __SMLAD(input1, input2, acc0);
00334 
00335         /* x[1] and x[2] are packed */
00336         in1 = (q15_t) x1;
00337         in2 = (q15_t) x2;
00338 
00339         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00340 
00341         /* acc1 += x[1] * y[0] + x[2] * y[1] */
00342         acc1 = __SMLAD(input1, input2, acc1);
00343 
00344         /* x[2] and x[3] are packed */
00345         in1 = (q15_t) x2;
00346         in2 = (q15_t) x3;
00347 
00348         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00349 
00350         /* acc2 += x[2] * y[0] + x[3] * y[1]  */
00351         acc2 = __SMLAD(input1, input2, acc2);
00352 
00353         /* Read x[4] sample */
00354         x0 = *(px++);
00355 
00356         /* x[3] and x[4] are packed */
00357         in1 = (q15_t) x3;
00358         in2 = (q15_t) x0;
00359 
00360         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00361 
00362         /* acc3 += x[3] * y[0] + x[4] * y[1]  */
00363         acc3 = __SMLAD(input1, input2, acc3);
00364 
00365         /* Read y[2] sample */
00366         c0 = *py++;
00367         /* Read y[3] sample */
00368         c1 = *py++;
00369 
00370         /* Read x[5] sample */
00371         x1 = *px++;
00372 
00373         /* x[2] and x[3] are packed */
00374         in1 = (q15_t) x2;
00375         in2 = (q15_t) x3;
00376 
00377         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00378 
00379         /* y[2] and y[3] are packed */
00380         in1 = (q15_t) c0;
00381         in2 = (q15_t) c1;
00382 
00383         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00384 
00385         /* acc0 += x[2] * y[2] + x[3] * y[3]  */
00386         acc0 = __SMLAD(input1, input2, acc0);
00387 
00388         /* x[3] and x[4] are packed */
00389         in1 = (q15_t) x3;
00390         in2 = (q15_t) x0;
00391 
00392         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00393 
00394         /* acc1 += x[3] * y[2] + x[4] * y[3]  */
00395         acc1 = __SMLAD(input1, input2, acc1);
00396 
00397         /* x[4] and x[5] are packed */
00398         in1 = (q15_t) x0;
00399         in2 = (q15_t) x1;
00400 
00401         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00402 
00403         /* acc2 += x[4] * y[2] + x[5] * y[3]  */
00404         acc2 = __SMLAD(input1, input2, acc2);
00405 
00406         /* Read x[6] sample */
00407         x2 = *px++;
00408 
00409         /* x[5] and x[6] are packed */
00410         in1 = (q15_t) x1;
00411         in2 = (q15_t) x2;
00412 
00413         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00414 
00415         /* acc3 += x[5] * y[2] + x[6] * y[3]  */
00416         acc3 = __SMLAD(input1, input2, acc3);
00417 
00418       } while (--k);
00419 
00420       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00421        ** No loop unrolling is used. */
00422       k = srcBLen % 0x4U;
00423 
00424       while (k > 0U)
00425       {
00426         /* Read y[4] sample */
00427         c0 = *py++;
00428 
00429         /* Read x[7] sample */
00430         x3 = *px++;
00431 
00432         /* Perform the multiply-accumulates */
00433         /* acc0 +=  x[4] * y[4] */
00434         acc0 += ((q15_t) x0 * c0);
00435         /* acc1 +=  x[5] * y[4] */
00436         acc1 += ((q15_t) x1 * c0);
00437         /* acc2 +=  x[6] * y[4] */
00438         acc2 += ((q15_t) x2 * c0);
00439         /* acc3 +=  x[7] * y[4] */
00440         acc3 += ((q15_t) x3 * c0);
00441 
00442         /* Reuse the present samples for the next MAC */
00443         x0 = x1;
00444         x1 = x2;
00445         x2 = x3;
00446 
00447         /* Decrement the loop counter */
00448         k--;
00449       }
00450 
00451       /* Store the result in the accumulator in the destination buffer. */
00452       *pOut = (q7_t) (__SSAT(acc0 >> 7, 8));
00453       /* Destination pointer is updated according to the address modifier, inc */
00454       pOut += inc;
00455 
00456       *pOut = (q7_t) (__SSAT(acc1 >> 7, 8));
00457       pOut += inc;
00458 
00459       *pOut = (q7_t) (__SSAT(acc2 >> 7, 8));
00460       pOut += inc;
00461 
00462       *pOut = (q7_t) (__SSAT(acc3 >> 7, 8));
00463       pOut += inc;
00464 
00465       count += 4U;
00466       /* Update the inputA and inputB pointers for next MAC calculation */
00467       px = pIn1 + count;
00468       py = pIn2;
00469 
00470       /* Decrement the loop counter */
00471       blkCnt--;
00472     }
00473 
00474     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
00475      ** No loop unrolling is used. */
00476     blkCnt = blockSize2 % 0x4U;
00477 
00478     while (blkCnt > 0U)
00479     {
00480       /* Accumulator is made zero for every iteration */
00481       sum = 0;
00482 
00483       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00484       k = srcBLen >> 2U;
00485 
00486       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00487        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00488       while (k > 0U)
00489       {
00490         /* Reading two inputs of SrcA buffer and packing */
00491         in1 = (q15_t) * px++;
00492         in2 = (q15_t) * px++;
00493         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00494 
00495         /* Reading two inputs of SrcB buffer and packing */
00496         in1 = (q15_t) * py++;
00497         in2 = (q15_t) * py++;
00498         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00499 
00500         /* Perform the multiply-accumulates */
00501         sum = __SMLAD(input1, input2, sum);
00502 
00503         /* Reading two inputs of SrcA buffer and packing */
00504         in1 = (q15_t) * px++;
00505         in2 = (q15_t) * px++;
00506         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00507 
00508         /* Reading two inputs of SrcB buffer and packing */
00509         in1 = (q15_t) * py++;
00510         in2 = (q15_t) * py++;
00511         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00512 
00513         /* Perform the multiply-accumulates */
00514         sum = __SMLAD(input1, input2, sum);
00515 
00516         /* Decrement the loop counter */
00517         k--;
00518       }
00519 
00520       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00521        ** No loop unrolling is used. */
00522       k = srcBLen % 0x4U;
00523 
00524       while (k > 0U)
00525       {
00526         /* Perform the multiply-accumulates */
00527         sum += ((q15_t) * px++ * *py++);
00528 
00529         /* Decrement the loop counter */
00530         k--;
00531       }
00532 
00533       /* Store the result in the accumulator in the destination buffer. */
00534       *pOut = (q7_t) (__SSAT(sum >> 7, 8));
00535       /* Destination pointer is updated according to the address modifier, inc */
00536       pOut += inc;
00537 
00538       /* Increment the pointer pIn1 index, count by 1 */
00539       count++;
00540 
00541       /* Update the inputA and inputB pointers for next MAC calculation */
00542       px = pIn1 + count;
00543       py = pIn2;
00544 
00545       /* Decrement the loop counter */
00546       blkCnt--;
00547     }
00548   }
00549   else
00550   {
00551     /* If the srcBLen is not a multiple of 4,
00552      * the blockSize2 loop cannot be unrolled by 4 */
00553     blkCnt = blockSize2;
00554 
00555     while (blkCnt > 0U)
00556     {
00557       /* Accumulator is made zero for every iteration */
00558       sum = 0;
00559 
00560       /* Loop over srcBLen */
00561       k = srcBLen;
00562 
00563       while (k > 0U)
00564       {
00565         /* Perform the multiply-accumulate */
00566         sum += ((q15_t) * px++ * *py++);
00567 
00568         /* Decrement the loop counter */
00569         k--;
00570       }
00571 
00572       /* Store the result in the accumulator in the destination buffer. */
00573       *pOut = (q7_t) (__SSAT(sum >> 7, 8));
00574       /* Destination pointer is updated according to the address modifier, inc */
00575       pOut += inc;
00576 
00577       /* Increment the MAC count */
00578       count++;
00579 
00580       /* Update the inputA and inputB pointers for next MAC calculation */
00581       px = pIn1 + count;
00582       py = pIn2;
00583 
00584 
00585       /* Decrement the loop counter */
00586       blkCnt--;
00587     }
00588   }
00589 
00590   /* --------------------------
00591    * Initializations of stage3
00592    * -------------------------*/
00593 
00594   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
00595    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
00596    * ....
00597    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
00598    * sum +=  x[srcALen-1] * y[0]
00599    */
00600 
00601   /* In this stage the MAC operations are decreased by 1 for every iteration.
00602      The count variable holds the number of MAC operations performed */
00603   count = srcBLen - 1U;
00604 
00605   /* Working pointer of inputA */
00606   pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
00607   px = pSrc1;
00608 
00609   /* Working pointer of inputB */
00610   py = pIn2;
00611 
00612   /* -------------------
00613    * Stage3 process
00614    * ------------------*/
00615 
00616   while (blockSize3 > 0U)
00617   {
00618     /* Accumulator is made zero for every iteration */
00619     sum = 0;
00620 
00621     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00622     k = count >> 2U;
00623 
00624     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00625      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00626     while (k > 0U)
00627     {
00628       /* x[srcALen - srcBLen + 1] , x[srcALen - srcBLen + 2]  */
00629       in1 = (q15_t) * px++;
00630       in2 = (q15_t) * px++;
00631       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00632 
00633       /* y[0] , y[1] */
00634       in1 = (q15_t) * py++;
00635       in2 = (q15_t) * py++;
00636       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00637 
00638       /* sum += x[srcALen - srcBLen + 1] * y[0] */
00639       /* sum += x[srcALen - srcBLen + 2] * y[1] */
00640       sum = __SMLAD(input1, input2, sum);
00641 
00642       /* x[srcALen - srcBLen + 3] , x[srcALen - srcBLen + 4] */
00643       in1 = (q15_t) * px++;
00644       in2 = (q15_t) * px++;
00645       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00646 
00647       /* y[2] , y[3] */
00648       in1 = (q15_t) * py++;
00649       in2 = (q15_t) * py++;
00650       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00651 
00652       /* sum += x[srcALen - srcBLen + 3] * y[2] */
00653       /* sum += x[srcALen - srcBLen + 4] * y[3] */
00654       sum = __SMLAD(input1, input2, sum);
00655 
00656       /* Decrement the loop counter */
00657       k--;
00658     }
00659 
00660     /* If the count is not a multiple of 4, compute any remaining MACs here.
00661      ** No loop unrolling is used. */
00662     k = count % 0x4U;
00663 
00664     while (k > 0U)
00665     {
00666       /* Perform the multiply-accumulates */
00667       sum += ((q15_t) * px++ * *py++);
00668 
00669       /* Decrement the loop counter */
00670       k--;
00671     }
00672 
00673     /* Store the result in the accumulator in the destination buffer. */
00674     *pOut = (q7_t) (__SSAT(sum >> 7, 8));
00675     /* Destination pointer is updated according to the address modifier, inc */
00676     pOut += inc;
00677 
00678     /* Update the inputA and inputB pointers for next MAC calculation */
00679     px = ++pSrc1;
00680     py = pIn2;
00681 
00682     /* Decrement the MAC count */
00683     count--;
00684 
00685     /* Decrement the loop counter */
00686     blockSize3--;
00687   }
00688 
00689 #else
00690 
00691 /* Run the below code for Cortex-M0 */
00692 
00693   q7_t *pIn1 = pSrcA;                            /* inputA pointer */
00694   q7_t *pIn2 = pSrcB + (srcBLen - 1U);           /* inputB pointer */
00695   q31_t sum;                                     /* Accumulator */
00696   uint32_t i = 0U, j;                            /* loop counters */
00697   uint32_t inv = 0U;                             /* Reverse order flag */
00698   uint32_t tot = 0U;                             /* Length */
00699 
00700   /* The algorithm implementation is based on the lengths of the inputs. */
00701   /* srcB is always made to slide across srcA. */
00702   /* So srcBLen is always considered as shorter or equal to srcALen */
00703   /* But CORR(x, y) is reverse of CORR(y, x) */
00704   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
00705   /* and a varaible, inv is set to 1 */
00706   /* If lengths are not equal then zero pad has to be done to  make the two
00707    * inputs of same length. But to improve the performance, we include zeroes
00708    * in the output instead of zero padding either of the the inputs*/
00709   /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
00710    * starting of the output buffer */
00711   /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
00712    * ending of the output buffer */
00713   /* Once the zero padding is done the remaining of the output is calcualted
00714    * using convolution but with the shorter signal time shifted. */
00715 
00716   /* Calculate the length of the remaining sequence */
00717   tot = ((srcALen + srcBLen) - 2U);
00718 
00719   if (srcALen > srcBLen)
00720   {
00721     /* Calculating the number of zeros to be padded to the output */
00722     j = srcALen - srcBLen;
00723 
00724     /* Initialise the pointer after zero padding */
00725     pDst += j;
00726   }
00727 
00728   else if (srcALen < srcBLen)
00729   {
00730     /* Initialization to inputB pointer */
00731     pIn1 = pSrcB;
00732 
00733     /* Initialization to the end of inputA pointer */
00734     pIn2 = pSrcA + (srcALen - 1U);
00735 
00736     /* Initialisation of the pointer after zero padding */
00737     pDst = pDst + tot;
00738 
00739     /* Swapping the lengths */
00740     j = srcALen;
00741     srcALen = srcBLen;
00742     srcBLen = j;
00743 
00744     /* Setting the reverse flag */
00745     inv = 1;
00746 
00747   }
00748 
00749   /* Loop to calculate convolution for output length number of times */
00750   for (i = 0U; i <= tot; i++)
00751   {
00752     /* Initialize sum with zero to carry on MAC operations */
00753     sum = 0;
00754 
00755     /* Loop to perform MAC operations according to convolution equation */
00756     for (j = 0U; j <= i; j++)
00757     {
00758       /* Check the array limitations */
00759       if ((((i - j) < srcBLen) && (j < srcALen)))
00760       {
00761         /* z[i] += x[i-j] * y[j] */
00762         sum += ((q15_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
00763       }
00764     }
00765     /* Store the output in the destination buffer */
00766     if (inv == 1)
00767       *pDst-- = (q7_t) __SSAT((sum >> 7U), 8U);
00768     else
00769       *pDst++ = (q7_t) __SSAT((sum >> 7U), 8U);
00770   }
00771 
00772 #endif /*   #if defined (ARM_MATH_DSP) */
00773 
00774 }
00775 
00776 /**
00777  * @} end of Corr group
00778  */
00779