Added CMSIS5 DSP and NN folder. Needs some work

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_correlate_fast_q15.c Source File

arm_correlate_fast_q15.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_correlate_fast_q15.c
00004  * Description:  Fast Q15 Correlation
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupFilters
00033  */
00034 
00035 /**
00036  * @addtogroup Corr
00037  * @{
00038  */
00039 
00040 /**
00041  * @brief Correlation of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
00042  * @param[in] *pSrcA points to the first input sequence.
00043  * @param[in] srcALen length of the first input sequence.
00044  * @param[in] *pSrcB points to the second input sequence.
00045  * @param[in] srcBLen length of the second input sequence.
00046  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
00047  * @return none.
00048  *
00049  * <b>Scaling and Overflow Behavior:</b>
00050  *
00051  * \par
00052  * This fast version uses a 32-bit accumulator with 2.30 format.
00053  * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
00054  * There is no saturation on intermediate additions.
00055  * Thus, if the accumulator overflows it wraps around and distorts the result.
00056  * The input signals should be scaled down to avoid intermediate overflows.
00057  * Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow, since at
00058  * most min(srcALen, srcBLen) additions are carried out internally.
00059  * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
00060  *
00061  * \par
00062  * See <code>arm_correlate_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
00063  */
00064 
00065 void arm_correlate_fast_q15(
00066   q15_t * pSrcA,
00067   uint32_t srcALen,
00068   q15_t * pSrcB,
00069   uint32_t srcBLen,
00070   q15_t * pDst)
00071 {
00072 #ifndef UNALIGNED_SUPPORT_DISABLE
00073 
00074   q15_t *pIn1;                                   /* inputA pointer               */
00075   q15_t *pIn2;                                   /* inputB pointer               */
00076   q15_t *pOut = pDst;                            /* output pointer               */
00077   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */
00078   q15_t *px;                                     /* Intermediate inputA pointer  */
00079   q15_t *py;                                     /* Intermediate inputB pointer  */
00080   q15_t *pSrc1;                                  /* Intermediate pointers        */
00081   q31_t x0, x1, x2, x3, c0;                      /* temporary variables for holding input and coefficient values */
00082   uint32_t j, k = 0U, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */
00083   int32_t inc = 1;                               /* Destination address modifier */
00084 
00085 
00086   /* The algorithm implementation is based on the lengths of the inputs. */
00087   /* srcB is always made to slide across srcA. */
00088   /* So srcBLen is always considered as shorter or equal to srcALen */
00089   /* But CORR(x, y) is reverse of CORR(y, x) */
00090   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
00091   /* and the destination pointer modifier, inc is set to -1 */
00092   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
00093   /* But to improve the performance,
00094    * we include zeroes in the output instead of zero padding either of the inputs */
00095   /* If srcALen > srcBLen,
00096    * (srcALen - srcBLen) zeroes have to be included at the start of the output buffer */
00097   /* If srcALen < srcBLen,
00098    * (srcBLen - srcALen) zeroes have to be included at the end of the output buffer */
00099   if (srcALen >= srcBLen)
00100   {
00101     /* Initialization of inputA pointer */
00102     pIn1 = (pSrcA);
00103 
00104     /* Initialization of inputB pointer */
00105     pIn2 = (pSrcB);
00106 
00107     /* Number of output samples is calculated */
00108     outBlockSize = (2U * srcALen) - 1U;
00109 
00110     /* When srcALen > srcBLen, zero padding is done to srcB
00111      * to make their lengths equal.
00112      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
00113      * number of output samples are made zero */
00114     j = outBlockSize - (srcALen + (srcBLen - 1U));
00115 
00116     /* Updating the pointer position to non zero value */
00117     pOut += j;
00118 
00119   }
00120   else
00121   {
00122     /* Initialization of inputA pointer */
00123     pIn1 = (pSrcB);
00124 
00125     /* Initialization of inputB pointer */
00126     pIn2 = (pSrcA);
00127 
00128     /* srcBLen is always considered as shorter or equal to srcALen */
00129     j = srcBLen;
00130     srcBLen = srcALen;
00131     srcALen = j;
00132 
00133     /* CORR(x, y) = Reverse order(CORR(y, x)) */
00134     /* Hence set the destination pointer to point to the last output sample */
00135     pOut = pDst + ((srcALen + srcBLen) - 2U);
00136 
00137     /* Destination address modifier is set to -1 */
00138     inc = -1;
00139 
00140   }
00141 
00142   /* The function is internally
00143    * divided into three parts according to the number of multiplications that has to be
00144    * taken place between inputA samples and inputB samples. In the first part of the
00145    * algorithm, the multiplications increase by one for every iteration.
00146    * In the second part of the algorithm, srcBLen number of multiplications are done.
00147    * In the third part of the algorithm, the multiplications decrease by one
00148    * for every iteration.*/
00149   /* The algorithm is implemented in three stages.
00150    * The loop counters of each stage is initiated here. */
00151   blockSize1 = srcBLen - 1U;
00152   blockSize2 = srcALen - (srcBLen - 1U);
00153   blockSize3 = blockSize1;
00154 
00155   /* --------------------------
00156    * Initializations of stage1
00157    * -------------------------*/
00158 
00159   /* sum = x[0] * y[srcBlen - 1]
00160    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
00161    * ....
00162    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
00163    */
00164 
00165   /* In this stage the MAC operations are increased by 1 for every iteration.
00166      The count variable holds the number of MAC operations performed */
00167   count = 1U;
00168 
00169   /* Working pointer of inputA */
00170   px = pIn1;
00171 
00172   /* Working pointer of inputB */
00173   pSrc1 = pIn2 + (srcBLen - 1U);
00174   py = pSrc1;
00175 
00176   /* ------------------------
00177    * Stage1 process
00178    * ----------------------*/
00179 
00180   /* The first loop starts here */
00181   while (blockSize1 > 0U)
00182   {
00183     /* Accumulator is made zero for every iteration */
00184     sum = 0;
00185 
00186     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00187     k = count >> 2;
00188 
00189     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00190      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00191     while (k > 0U)
00192     {
00193       /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
00194       sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
00195       /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
00196       sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
00197 
00198       /* Decrement the loop counter */
00199       k--;
00200     }
00201 
00202     /* If the count is not a multiple of 4, compute any remaining MACs here.
00203      ** No loop unrolling is used. */
00204     k = count % 0x4U;
00205 
00206     while (k > 0U)
00207     {
00208       /* Perform the multiply-accumulates */
00209       /* x[0] * y[srcBLen - 1] */
00210       sum = __SMLAD(*px++, *py++, sum);
00211 
00212       /* Decrement the loop counter */
00213       k--;
00214     }
00215 
00216     /* Store the result in the accumulator in the destination buffer. */
00217     *pOut = (q15_t) (sum >> 15);
00218     /* Destination pointer is updated according to the address modifier, inc */
00219     pOut += inc;
00220 
00221     /* Update the inputA and inputB pointers for next MAC calculation */
00222     py = pSrc1 - count;
00223     px = pIn1;
00224 
00225     /* Increment the MAC count */
00226     count++;
00227 
00228     /* Decrement the loop counter */
00229     blockSize1--;
00230   }
00231 
00232   /* --------------------------
00233    * Initializations of stage2
00234    * ------------------------*/
00235 
00236   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
00237    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
00238    * ....
00239    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
00240    */
00241 
00242   /* Working pointer of inputA */
00243   px = pIn1;
00244 
00245   /* Working pointer of inputB */
00246   py = pIn2;
00247 
00248   /* count is index by which the pointer pIn1 to be incremented */
00249   count = 0U;
00250 
00251   /* -------------------
00252    * Stage2 process
00253    * ------------------*/
00254 
00255   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
00256    * So, to loop unroll over blockSize2,
00257    * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
00258   if (srcBLen >= 4U)
00259   {
00260     /* Loop unroll over blockSize2, by 4 */
00261     blkCnt = blockSize2 >> 2U;
00262 
00263     while (blkCnt > 0U)
00264     {
00265       /* Set all accumulators to zero */
00266       acc0 = 0;
00267       acc1 = 0;
00268       acc2 = 0;
00269       acc3 = 0;
00270 
00271       /* read x[0], x[1] samples */
00272       x0 = *__SIMD32(px);
00273       /* read x[1], x[2] samples */
00274       x1 = _SIMD32_OFFSET(px + 1);
00275       px += 2U;
00276 
00277       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00278       k = srcBLen >> 2U;
00279 
00280       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00281        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00282       do
00283       {
00284         /* Read the first two inputB samples using SIMD:
00285          * y[0] and y[1] */
00286         c0 = *__SIMD32(py)++;
00287 
00288         /* acc0 +=  x[0] * y[0] + x[1] * y[1] */
00289         acc0 = __SMLAD(x0, c0, acc0);
00290 
00291         /* acc1 +=  x[1] * y[0] + x[2] * y[1] */
00292         acc1 = __SMLAD(x1, c0, acc1);
00293 
00294         /* Read x[2], x[3] */
00295         x2 = *__SIMD32(px);
00296 
00297         /* Read x[3], x[4] */
00298         x3 = _SIMD32_OFFSET(px + 1);
00299 
00300         /* acc2 +=  x[2] * y[0] + x[3] * y[1] */
00301         acc2 = __SMLAD(x2, c0, acc2);
00302 
00303         /* acc3 +=  x[3] * y[0] + x[4] * y[1] */
00304         acc3 = __SMLAD(x3, c0, acc3);
00305 
00306         /* Read y[2] and y[3] */
00307         c0 = *__SIMD32(py)++;
00308 
00309         /* acc0 +=  x[2] * y[2] + x[3] * y[3] */
00310         acc0 = __SMLAD(x2, c0, acc0);
00311 
00312         /* acc1 +=  x[3] * y[2] + x[4] * y[3] */
00313         acc1 = __SMLAD(x3, c0, acc1);
00314 
00315         /* Read x[4], x[5] */
00316         x0 = _SIMD32_OFFSET(px + 2);
00317 
00318         /* Read x[5], x[6] */
00319         x1 = _SIMD32_OFFSET(px + 3);
00320         px += 4U;
00321 
00322         /* acc2 +=  x[4] * y[2] + x[5] * y[3] */
00323         acc2 = __SMLAD(x0, c0, acc2);
00324 
00325         /* acc3 +=  x[5] * y[2] + x[6] * y[3] */
00326         acc3 = __SMLAD(x1, c0, acc3);
00327 
00328       } while (--k);
00329 
00330       /* For the next MAC operations, SIMD is not used
00331        * So, the 16 bit pointer if inputB, py is updated */
00332 
00333       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00334        ** No loop unrolling is used. */
00335       k = srcBLen % 0x4U;
00336 
00337       if (k == 1U)
00338       {
00339         /* Read y[4] */
00340         c0 = *py;
00341 #ifdef  ARM_MATH_BIG_ENDIAN
00342 
00343         c0 = c0 << 16U;
00344 
00345 #else
00346 
00347         c0 = c0 & 0x0000FFFF;
00348 
00349 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00350 
00351         /* Read x[7] */
00352         x3 = *__SIMD32(px);
00353         px++;
00354 
00355         /* Perform the multiply-accumulates */
00356         acc0 = __SMLAD(x0, c0, acc0);
00357         acc1 = __SMLAD(x1, c0, acc1);
00358         acc2 = __SMLADX(x1, c0, acc2);
00359         acc3 = __SMLADX(x3, c0, acc3);
00360       }
00361 
00362       if (k == 2U)
00363       {
00364         /* Read y[4], y[5] */
00365         c0 = *__SIMD32(py);
00366 
00367         /* Read x[7], x[8] */
00368         x3 = *__SIMD32(px);
00369 
00370         /* Read x[9] */
00371         x2 = _SIMD32_OFFSET(px + 1);
00372         px += 2U;
00373 
00374         /* Perform the multiply-accumulates */
00375         acc0 = __SMLAD(x0, c0, acc0);
00376         acc1 = __SMLAD(x1, c0, acc1);
00377         acc2 = __SMLAD(x3, c0, acc2);
00378         acc3 = __SMLAD(x2, c0, acc3);
00379       }
00380 
00381       if (k == 3U)
00382       {
00383         /* Read y[4], y[5] */
00384         c0 = *__SIMD32(py)++;
00385 
00386         /* Read x[7], x[8] */
00387         x3 = *__SIMD32(px);
00388 
00389         /* Read x[9] */
00390         x2 = _SIMD32_OFFSET(px + 1);
00391 
00392         /* Perform the multiply-accumulates */
00393         acc0 = __SMLAD(x0, c0, acc0);
00394         acc1 = __SMLAD(x1, c0, acc1);
00395         acc2 = __SMLAD(x3, c0, acc2);
00396         acc3 = __SMLAD(x2, c0, acc3);
00397 
00398         c0 = (*py);
00399         /* Read y[6] */
00400 #ifdef  ARM_MATH_BIG_ENDIAN
00401 
00402         c0 = c0 << 16U;
00403 #else
00404 
00405         c0 = c0 & 0x0000FFFF;
00406 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00407 
00408         /* Read x[10] */
00409         x3 = _SIMD32_OFFSET(px + 2);
00410         px += 3U;
00411 
00412         /* Perform the multiply-accumulates */
00413         acc0 = __SMLADX(x1, c0, acc0);
00414         acc1 = __SMLAD(x2, c0, acc1);
00415         acc2 = __SMLADX(x2, c0, acc2);
00416         acc3 = __SMLADX(x3, c0, acc3);
00417       }
00418 
00419       /* Store the result in the accumulator in the destination buffer. */
00420       *pOut = (q15_t) (acc0 >> 15);
00421       /* Destination pointer is updated according to the address modifier, inc */
00422       pOut += inc;
00423 
00424       *pOut = (q15_t) (acc1 >> 15);
00425       pOut += inc;
00426 
00427       *pOut = (q15_t) (acc2 >> 15);
00428       pOut += inc;
00429 
00430       *pOut = (q15_t) (acc3 >> 15);
00431       pOut += inc;
00432 
00433       /* Increment the pointer pIn1 index, count by 4 */
00434       count += 4U;
00435 
00436       /* Update the inputA and inputB pointers for next MAC calculation */
00437       px = pIn1 + count;
00438       py = pIn2;
00439 
00440 
00441       /* Decrement the loop counter */
00442       blkCnt--;
00443     }
00444 
00445     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
00446      ** No loop unrolling is used. */
00447     blkCnt = blockSize2 % 0x4U;
00448 
00449     while (blkCnt > 0U)
00450     {
00451       /* Accumulator is made zero for every iteration */
00452       sum = 0;
00453 
00454       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00455       k = srcBLen >> 2U;
00456 
00457       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00458        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00459       while (k > 0U)
00460       {
00461         /* Perform the multiply-accumulates */
00462         sum += ((q31_t) * px++ * *py++);
00463         sum += ((q31_t) * px++ * *py++);
00464         sum += ((q31_t) * px++ * *py++);
00465         sum += ((q31_t) * px++ * *py++);
00466 
00467         /* Decrement the loop counter */
00468         k--;
00469       }
00470 
00471       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00472        ** No loop unrolling is used. */
00473       k = srcBLen % 0x4U;
00474 
00475       while (k > 0U)
00476       {
00477         /* Perform the multiply-accumulates */
00478         sum += ((q31_t) * px++ * *py++);
00479 
00480         /* Decrement the loop counter */
00481         k--;
00482       }
00483 
00484       /* Store the result in the accumulator in the destination buffer. */
00485       *pOut = (q15_t) (sum >> 15);
00486       /* Destination pointer is updated according to the address modifier, inc */
00487       pOut += inc;
00488 
00489       /* Increment the pointer pIn1 index, count by 1 */
00490       count++;
00491 
00492       /* Update the inputA and inputB pointers for next MAC calculation */
00493       px = pIn1 + count;
00494       py = pIn2;
00495 
00496       /* Decrement the loop counter */
00497       blkCnt--;
00498     }
00499   }
00500   else
00501   {
00502     /* If srcBLen is less than 4,
00503      * the blockSize2 loop cannot be unrolled by 4 */
00504     blkCnt = blockSize2;
00505 
00506     while (blkCnt > 0U)
00507     {
00508       /* Accumulator is made zero for every iteration */
00509       sum = 0;
00510 
00511       /* Loop over srcBLen */
00512       k = srcBLen;
00513 
00514       while (k > 0U)
00515       {
00516         /* Perform the multiply-accumulate */
00517         sum += ((q31_t) * px++ * *py++);
00518 
00519         /* Decrement the loop counter */
00520         k--;
00521       }
00522 
00523       /* Store the result in the accumulator in the destination buffer. */
00524       *pOut = (q15_t) (sum >> 15);
00525       /* Destination pointer is updated according to the address modifier, inc */
00526       pOut += inc;
00527 
00528       /* Increment the MAC count */
00529       count++;
00530 
00531       /* Update the inputA and inputB pointers for next MAC calculation */
00532       px = pIn1 + count;
00533       py = pIn2;
00534 
00535       /* Decrement the loop counter */
00536       blkCnt--;
00537     }
00538   }
00539 
00540   /* --------------------------
00541    * Initializations of stage3
00542    * -------------------------*/
00543 
00544   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
00545    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
00546    * ....
00547    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
00548    * sum +=  x[srcALen-1] * y[0]
00549    */
00550 
00551   /* In this stage the MAC operations are decreased by 1 for every iteration.
00552      The count variable holds the number of MAC operations performed */
00553   count = srcBLen - 1U;
00554 
00555   /* Working pointer of inputA */
00556   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
00557   px = pSrc1;
00558 
00559   /* Working pointer of inputB */
00560   py = pIn2;
00561 
00562   /* -------------------
00563    * Stage3 process
00564    * ------------------*/
00565 
00566   while (blockSize3 > 0U)
00567   {
00568     /* Accumulator is made zero for every iteration */
00569     sum = 0;
00570 
00571     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00572     k = count >> 2U;
00573 
00574     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00575      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00576     while (k > 0U)
00577     {
00578       /* Perform the multiply-accumulates */
00579       /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
00580       sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
00581       /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
00582       sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
00583 
00584       /* Decrement the loop counter */
00585       k--;
00586     }
00587 
00588     /* If the count is not a multiple of 4, compute any remaining MACs here.
00589      ** No loop unrolling is used. */
00590     k = count % 0x4U;
00591 
00592     while (k > 0U)
00593     {
00594       /* Perform the multiply-accumulates */
00595       sum = __SMLAD(*px++, *py++, sum);
00596 
00597       /* Decrement the loop counter */
00598       k--;
00599     }
00600 
00601     /* Store the result in the accumulator in the destination buffer. */
00602     *pOut = (q15_t) (sum >> 15);
00603     /* Destination pointer is updated according to the address modifier, inc */
00604     pOut += inc;
00605 
00606     /* Update the inputA and inputB pointers for next MAC calculation */
00607     px = ++pSrc1;
00608     py = pIn2;
00609 
00610     /* Decrement the MAC count */
00611     count--;
00612 
00613     /* Decrement the loop counter */
00614     blockSize3--;
00615   }
00616 
00617 #else
00618 
00619   q15_t *pIn1;                                   /* inputA pointer               */
00620   q15_t *pIn2;                                   /* inputB pointer               */
00621   q15_t *pOut = pDst;                            /* output pointer               */
00622   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */
00623   q15_t *px;                                     /* Intermediate inputA pointer  */
00624   q15_t *py;                                     /* Intermediate inputB pointer  */
00625   q15_t *pSrc1;                                  /* Intermediate pointers        */
00626   q31_t x0, x1, x2, x3, c0;                      /* temporary variables for holding input and coefficient values */
00627   uint32_t j, k = 0U, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */
00628   int32_t inc = 1;                               /* Destination address modifier */
00629   q15_t a, b;
00630 
00631 
00632   /* The algorithm implementation is based on the lengths of the inputs. */
00633   /* srcB is always made to slide across srcA. */
00634   /* So srcBLen is always considered as shorter or equal to srcALen */
00635   /* But CORR(x, y) is reverse of CORR(y, x) */
00636   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
00637   /* and the destination pointer modifier, inc is set to -1 */
00638   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
00639   /* But to improve the performance,
00640    * we include zeroes in the output instead of zero padding either of the inputs */
00641   /* If srcALen > srcBLen,
00642    * (srcALen - srcBLen) zeroes have to be included at the start of the output buffer */
00643   /* If srcALen < srcBLen,
00644    * (srcBLen - srcALen) zeroes have to be included at the end of the output buffer */
00645   if (srcALen >= srcBLen)
00646   {
00647     /* Initialization of inputA pointer */
00648     pIn1 = (pSrcA);
00649 
00650     /* Initialization of inputB pointer */
00651     pIn2 = (pSrcB);
00652 
00653     /* Number of output samples is calculated */
00654     outBlockSize = (2U * srcALen) - 1U;
00655 
00656     /* When srcALen > srcBLen, zero padding is done to srcB
00657      * to make their lengths equal.
00658      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
00659      * number of output samples are made zero */
00660     j = outBlockSize - (srcALen + (srcBLen - 1U));
00661 
00662     /* Updating the pointer position to non zero value */
00663     pOut += j;
00664 
00665   }
00666   else
00667   {
00668     /* Initialization of inputA pointer */
00669     pIn1 = (pSrcB);
00670 
00671     /* Initialization of inputB pointer */
00672     pIn2 = (pSrcA);
00673 
00674     /* srcBLen is always considered as shorter or equal to srcALen */
00675     j = srcBLen;
00676     srcBLen = srcALen;
00677     srcALen = j;
00678 
00679     /* CORR(x, y) = Reverse order(CORR(y, x)) */
00680     /* Hence set the destination pointer to point to the last output sample */
00681     pOut = pDst + ((srcALen + srcBLen) - 2U);
00682 
00683     /* Destination address modifier is set to -1 */
00684     inc = -1;
00685 
00686   }
00687 
00688   /* The function is internally
00689    * divided into three parts according to the number of multiplications that has to be
00690    * taken place between inputA samples and inputB samples. In the first part of the
00691    * algorithm, the multiplications increase by one for every iteration.
00692    * In the second part of the algorithm, srcBLen number of multiplications are done.
00693    * In the third part of the algorithm, the multiplications decrease by one
00694    * for every iteration.*/
00695   /* The algorithm is implemented in three stages.
00696    * The loop counters of each stage is initiated here. */
00697   blockSize1 = srcBLen - 1U;
00698   blockSize2 = srcALen - (srcBLen - 1U);
00699   blockSize3 = blockSize1;
00700 
00701   /* --------------------------
00702    * Initializations of stage1
00703    * -------------------------*/
00704 
00705   /* sum = x[0] * y[srcBlen - 1]
00706    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
00707    * ....
00708    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
00709    */
00710 
00711   /* In this stage the MAC operations are increased by 1 for every iteration.
00712      The count variable holds the number of MAC operations performed */
00713   count = 1U;
00714 
00715   /* Working pointer of inputA */
00716   px = pIn1;
00717 
00718   /* Working pointer of inputB */
00719   pSrc1 = pIn2 + (srcBLen - 1U);
00720   py = pSrc1;
00721 
00722   /* ------------------------
00723    * Stage1 process
00724    * ----------------------*/
00725 
00726   /* The first loop starts here */
00727   while (blockSize1 > 0U)
00728   {
00729     /* Accumulator is made zero for every iteration */
00730     sum = 0;
00731 
00732     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00733     k = count >> 2;
00734 
00735     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00736      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00737     while (k > 0U)
00738     {
00739       /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
00740         sum += ((q31_t) * px++ * *py++);
00741         sum += ((q31_t) * px++ * *py++);
00742         sum += ((q31_t) * px++ * *py++);
00743         sum += ((q31_t) * px++ * *py++);
00744 
00745       /* Decrement the loop counter */
00746       k--;
00747     }
00748 
00749     /* If the count is not a multiple of 4, compute any remaining MACs here.
00750      ** No loop unrolling is used. */
00751     k = count % 0x4U;
00752 
00753     while (k > 0U)
00754     {
00755       /* Perform the multiply-accumulates */
00756       /* x[0] * y[srcBLen - 1] */
00757         sum += ((q31_t) * px++ * *py++);
00758 
00759       /* Decrement the loop counter */
00760       k--;
00761     }
00762 
00763     /* Store the result in the accumulator in the destination buffer. */
00764     *pOut = (q15_t) (sum >> 15);
00765     /* Destination pointer is updated according to the address modifier, inc */
00766     pOut += inc;
00767 
00768     /* Update the inputA and inputB pointers for next MAC calculation */
00769     py = pSrc1 - count;
00770     px = pIn1;
00771 
00772     /* Increment the MAC count */
00773     count++;
00774 
00775     /* Decrement the loop counter */
00776     blockSize1--;
00777   }
00778 
00779   /* --------------------------
00780    * Initializations of stage2
00781    * ------------------------*/
00782 
00783   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
00784    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
00785    * ....
00786    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
00787    */
00788 
00789   /* Working pointer of inputA */
00790   px = pIn1;
00791 
00792   /* Working pointer of inputB */
00793   py = pIn2;
00794 
00795   /* count is index by which the pointer pIn1 to be incremented */
00796   count = 0U;
00797 
00798   /* -------------------
00799    * Stage2 process
00800    * ------------------*/
00801 
00802   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
00803    * So, to loop unroll over blockSize2,
00804    * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
00805   if (srcBLen >= 4U)
00806   {
00807     /* Loop unroll over blockSize2, by 4 */
00808     blkCnt = blockSize2 >> 2U;
00809 
00810     while (blkCnt > 0U)
00811     {
00812       /* Set all accumulators to zero */
00813       acc0 = 0;
00814       acc1 = 0;
00815       acc2 = 0;
00816       acc3 = 0;
00817 
00818       /* read x[0], x[1], x[2] samples */
00819       a = *px;
00820       b = *(px + 1);
00821 
00822 #ifndef ARM_MATH_BIG_ENDIAN
00823 
00824       x0 = __PKHBT(a, b, 16);
00825       a = *(px + 2);
00826       x1 = __PKHBT(b, a, 16);
00827 
00828 #else
00829 
00830       x0 = __PKHBT(b, a, 16);
00831       a = *(px + 2);
00832       x1 = __PKHBT(a, b, 16);
00833 
00834 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
00835 
00836       px += 2U;
00837 
00838       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00839       k = srcBLen >> 2U;
00840 
00841       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00842        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00843       do
00844       {
00845         /* Read the first two inputB samples using SIMD:
00846          * y[0] and y[1] */
00847           a = *py;
00848           b = *(py + 1);
00849 
00850 #ifndef ARM_MATH_BIG_ENDIAN
00851 
00852           c0 = __PKHBT(a, b, 16);
00853 
00854 #else
00855 
00856           c0 = __PKHBT(b, a, 16);
00857 
00858 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
00859 
00860         /* acc0 +=  x[0] * y[0] + x[1] * y[1] */
00861         acc0 = __SMLAD(x0, c0, acc0);
00862 
00863         /* acc1 +=  x[1] * y[0] + x[2] * y[1] */
00864         acc1 = __SMLAD(x1, c0, acc1);
00865 
00866         /* Read x[2], x[3], x[4] */
00867         a = *px;
00868         b = *(px + 1);
00869 
00870 #ifndef ARM_MATH_BIG_ENDIAN
00871 
00872         x2 = __PKHBT(a, b, 16);
00873         a = *(px + 2);
00874         x3 = __PKHBT(b, a, 16);
00875 
00876 #else
00877 
00878         x2 = __PKHBT(b, a, 16);
00879         a = *(px + 2);
00880         x3 = __PKHBT(a, b, 16);
00881 
00882 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
00883 
00884         /* acc2 +=  x[2] * y[0] + x[3] * y[1] */
00885         acc2 = __SMLAD(x2, c0, acc2);
00886 
00887         /* acc3 +=  x[3] * y[0] + x[4] * y[1] */
00888         acc3 = __SMLAD(x3, c0, acc3);
00889 
00890         /* Read y[2] and y[3] */
00891           a = *(py + 2);
00892           b = *(py + 3);
00893 
00894           py += 4U;
00895 
00896 #ifndef ARM_MATH_BIG_ENDIAN
00897 
00898           c0 = __PKHBT(a, b, 16);
00899 
00900 #else
00901 
00902           c0 = __PKHBT(b, a, 16);
00903 
00904 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
00905 
00906         /* acc0 +=  x[2] * y[2] + x[3] * y[3] */
00907         acc0 = __SMLAD(x2, c0, acc0);
00908 
00909         /* acc1 +=  x[3] * y[2] + x[4] * y[3] */
00910         acc1 = __SMLAD(x3, c0, acc1);
00911 
00912         /* Read x[4], x[5], x[6] */
00913         a = *(px + 2);
00914         b = *(px + 3);
00915 
00916 #ifndef ARM_MATH_BIG_ENDIAN
00917 
00918         x0 = __PKHBT(a, b, 16);
00919         a = *(px + 4);
00920         x1 = __PKHBT(b, a, 16);
00921 
00922 #else
00923 
00924         x0 = __PKHBT(b, a, 16);
00925         a = *(px + 4);
00926         x1 = __PKHBT(a, b, 16);
00927 
00928 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
00929 
00930         px += 4U;
00931 
00932         /* acc2 +=  x[4] * y[2] + x[5] * y[3] */
00933         acc2 = __SMLAD(x0, c0, acc2);
00934 
00935         /* acc3 +=  x[5] * y[2] + x[6] * y[3] */
00936         acc3 = __SMLAD(x1, c0, acc3);
00937 
00938       } while (--k);
00939 
00940       /* For the next MAC operations, SIMD is not used
00941        * So, the 16 bit pointer of inputB, py is updated */
00942 
00943       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00944        ** No loop unrolling is used. */
00945       k = srcBLen % 0x4U;
00946 
00947       if (k == 1U)
00948       {
00949         /* Read y[4] */
00950         c0 = *py;
00951 #ifdef  ARM_MATH_BIG_ENDIAN
00952 
00953         c0 = c0 << 16U;
00954 
00955 #else
00956 
00957         c0 = c0 & 0x0000FFFF;
00958 
00959 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00960 
00961         /* Read x[7] */
00962         a = *px;
00963         b = *(px + 1);
00964 
00965         px++;;
00966 
00967 #ifndef ARM_MATH_BIG_ENDIAN
00968 
00969         x3 = __PKHBT(a, b, 16);
00970 
00971 #else
00972 
00973         x3 = __PKHBT(b, a, 16);
00974 
00975 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
00976 
00977         px++;
00978 
00979         /* Perform the multiply-accumulates */
00980         acc0 = __SMLAD(x0, c0, acc0);
00981         acc1 = __SMLAD(x1, c0, acc1);
00982         acc2 = __SMLADX(x1, c0, acc2);
00983         acc3 = __SMLADX(x3, c0, acc3);
00984       }
00985 
00986       if (k == 2U)
00987       {
00988         /* Read y[4], y[5] */
00989           a = *py;
00990           b = *(py + 1);
00991 
00992 #ifndef ARM_MATH_BIG_ENDIAN
00993 
00994           c0 = __PKHBT(a, b, 16);
00995 
00996 #else
00997 
00998           c0 = __PKHBT(b, a, 16);
00999 
01000 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01001 
01002         /* Read x[7], x[8], x[9] */
01003         a = *px;
01004         b = *(px + 1);
01005 
01006 #ifndef ARM_MATH_BIG_ENDIAN
01007 
01008         x3 = __PKHBT(a, b, 16);
01009         a = *(px + 2);
01010         x2 = __PKHBT(b, a, 16);
01011 
01012 #else
01013 
01014         x3 = __PKHBT(b, a, 16);
01015         a = *(px + 2);
01016         x2 = __PKHBT(a, b, 16);
01017 
01018 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01019 
01020         px += 2U;
01021 
01022         /* Perform the multiply-accumulates */
01023         acc0 = __SMLAD(x0, c0, acc0);
01024         acc1 = __SMLAD(x1, c0, acc1);
01025         acc2 = __SMLAD(x3, c0, acc2);
01026         acc3 = __SMLAD(x2, c0, acc3);
01027       }
01028 
01029       if (k == 3U)
01030       {
01031         /* Read y[4], y[5] */
01032           a = *py;
01033           b = *(py + 1);
01034 
01035 #ifndef ARM_MATH_BIG_ENDIAN
01036 
01037           c0 = __PKHBT(a, b, 16);
01038 
01039 #else
01040 
01041           c0 = __PKHBT(b, a, 16);
01042 
01043 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01044 
01045         py += 2U;
01046 
01047         /* Read x[7], x[8], x[9] */
01048         a = *px;
01049         b = *(px + 1);
01050 
01051 #ifndef ARM_MATH_BIG_ENDIAN
01052 
01053         x3 = __PKHBT(a, b, 16);
01054         a = *(px + 2);
01055         x2 = __PKHBT(b, a, 16);
01056 
01057 #else
01058 
01059         x3 = __PKHBT(b, a, 16);
01060         a = *(px + 2);
01061         x2 = __PKHBT(a, b, 16);
01062 
01063 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01064 
01065         /* Perform the multiply-accumulates */
01066         acc0 = __SMLAD(x0, c0, acc0);
01067         acc1 = __SMLAD(x1, c0, acc1);
01068         acc2 = __SMLAD(x3, c0, acc2);
01069         acc3 = __SMLAD(x2, c0, acc3);
01070 
01071         c0 = (*py);
01072         /* Read y[6] */
01073 #ifdef  ARM_MATH_BIG_ENDIAN
01074 
01075         c0 = c0 << 16U;
01076 #else
01077 
01078         c0 = c0 & 0x0000FFFF;
01079 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
01080 
01081         /* Read x[10] */
01082         b = *(px + 3);
01083 
01084 #ifndef ARM_MATH_BIG_ENDIAN
01085 
01086         x3 = __PKHBT(a, b, 16);
01087 
01088 #else
01089 
01090         x3 = __PKHBT(b, a, 16);
01091 
01092 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01093 
01094         px += 3U;
01095 
01096         /* Perform the multiply-accumulates */
01097         acc0 = __SMLADX(x1, c0, acc0);
01098         acc1 = __SMLAD(x2, c0, acc1);
01099         acc2 = __SMLADX(x2, c0, acc2);
01100         acc3 = __SMLADX(x3, c0, acc3);
01101       }
01102 
01103       /* Store the result in the accumulator in the destination buffer. */
01104       *pOut = (q15_t) (acc0 >> 15);
01105       /* Destination pointer is updated according to the address modifier, inc */
01106       pOut += inc;
01107 
01108       *pOut = (q15_t) (acc1 >> 15);
01109       pOut += inc;
01110 
01111       *pOut = (q15_t) (acc2 >> 15);
01112       pOut += inc;
01113 
01114       *pOut = (q15_t) (acc3 >> 15);
01115       pOut += inc;
01116 
01117       /* Increment the pointer pIn1 index, count by 4 */
01118       count += 4U;
01119 
01120       /* Update the inputA and inputB pointers for next MAC calculation */
01121       px = pIn1 + count;
01122       py = pIn2;
01123 
01124 
01125       /* Decrement the loop counter */
01126       blkCnt--;
01127     }
01128 
01129     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
01130      ** No loop unrolling is used. */
01131     blkCnt = blockSize2 % 0x4U;
01132 
01133     while (blkCnt > 0U)
01134     {
01135       /* Accumulator is made zero for every iteration */
01136       sum = 0;
01137 
01138       /* Apply loop unrolling and compute 4 MACs simultaneously. */
01139       k = srcBLen >> 2U;
01140 
01141       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
01142        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
01143       while (k > 0U)
01144       {
01145         /* Perform the multiply-accumulates */
01146         sum += ((q31_t) * px++ * *py++);
01147         sum += ((q31_t) * px++ * *py++);
01148         sum += ((q31_t) * px++ * *py++);
01149         sum += ((q31_t) * px++ * *py++);
01150 
01151         /* Decrement the loop counter */
01152         k--;
01153       }
01154 
01155       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
01156        ** No loop unrolling is used. */
01157       k = srcBLen % 0x4U;
01158 
01159       while (k > 0U)
01160       {
01161         /* Perform the multiply-accumulates */
01162         sum += ((q31_t) * px++ * *py++);
01163 
01164         /* Decrement the loop counter */
01165         k--;
01166       }
01167 
01168       /* Store the result in the accumulator in the destination buffer. */
01169       *pOut = (q15_t) (sum >> 15);
01170       /* Destination pointer is updated according to the address modifier, inc */
01171       pOut += inc;
01172 
01173       /* Increment the pointer pIn1 index, count by 1 */
01174       count++;
01175 
01176       /* Update the inputA and inputB pointers for next MAC calculation */
01177       px = pIn1 + count;
01178       py = pIn2;
01179 
01180       /* Decrement the loop counter */
01181       blkCnt--;
01182     }
01183   }
01184   else
01185   {
01186     /* If srcBLen is less than 4,
01187      * the blockSize2 loop cannot be unrolled by 4 */
01188     blkCnt = blockSize2;
01189 
01190     while (blkCnt > 0U)
01191     {
01192       /* Accumulator is made zero for every iteration */
01193       sum = 0;
01194 
01195       /* Loop over srcBLen */
01196       k = srcBLen;
01197 
01198       while (k > 0U)
01199       {
01200         /* Perform the multiply-accumulate */
01201         sum += ((q31_t) * px++ * *py++);
01202 
01203         /* Decrement the loop counter */
01204         k--;
01205       }
01206 
01207       /* Store the result in the accumulator in the destination buffer. */
01208       *pOut = (q15_t) (sum >> 15);
01209       /* Destination pointer is updated according to the address modifier, inc */
01210       pOut += inc;
01211 
01212       /* Increment the MAC count */
01213       count++;
01214 
01215       /* Update the inputA and inputB pointers for next MAC calculation */
01216       px = pIn1 + count;
01217       py = pIn2;
01218 
01219       /* Decrement the loop counter */
01220       blkCnt--;
01221     }
01222   }
01223 
01224   /* --------------------------
01225    * Initializations of stage3
01226    * -------------------------*/
01227 
01228   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
01229    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
01230    * ....
01231    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
01232    * sum +=  x[srcALen-1] * y[0]
01233    */
01234 
01235   /* In this stage the MAC operations are decreased by 1 for every iteration.
01236      The count variable holds the number of MAC operations performed */
01237   count = srcBLen - 1U;
01238 
01239   /* Working pointer of inputA */
01240   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
01241   px = pSrc1;
01242 
01243   /* Working pointer of inputB */
01244   py = pIn2;
01245 
01246   /* -------------------
01247    * Stage3 process
01248    * ------------------*/
01249 
01250   while (blockSize3 > 0U)
01251   {
01252     /* Accumulator is made zero for every iteration */
01253     sum = 0;
01254 
01255     /* Apply loop unrolling and compute 4 MACs simultaneously. */
01256     k = count >> 2U;
01257 
01258     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
01259      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
01260     while (k > 0U)
01261     {
01262       /* Perform the multiply-accumulates */
01263         sum += ((q31_t) * px++ * *py++);
01264         sum += ((q31_t) * px++ * *py++);
01265         sum += ((q31_t) * px++ * *py++);
01266         sum += ((q31_t) * px++ * *py++);
01267 
01268       /* Decrement the loop counter */
01269       k--;
01270     }
01271 
01272     /* If the count is not a multiple of 4, compute any remaining MACs here.
01273      ** No loop unrolling is used. */
01274     k = count % 0x4U;
01275 
01276     while (k > 0U)
01277     {
01278       /* Perform the multiply-accumulates */
01279         sum += ((q31_t) * px++ * *py++);
01280 
01281       /* Decrement the loop counter */
01282       k--;
01283     }
01284 
01285     /* Store the result in the accumulator in the destination buffer. */
01286     *pOut = (q15_t) (sum >> 15);
01287     /* Destination pointer is updated according to the address modifier, inc */
01288     pOut += inc;
01289 
01290     /* Update the inputA and inputB pointers for next MAC calculation */
01291     px = ++pSrc1;
01292     py = pIn2;
01293 
01294     /* Decrement the MAC count */
01295     count--;
01296 
01297     /* Decrement the loop counter */
01298     blockSize3--;
01299   }
01300 
01301 #endif /*   #ifndef UNALIGNED_SUPPORT_DISABLE */
01302 
01303 }
01304 
01305 /**
01306  * @} end of Corr group
01307  */
01308