CMSIS DSP library

Dependents:   KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more

Fork of mbed-dsp by mbed official

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_correlate_fast_q31.c Source File

arm_correlate_fast_q31.c

00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        17. January 2013
00005 * $Revision:    V1.4.1
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_correlate_fast_q31.c    
00009 *    
00010 * Description:  Fast Q31 Correlation.    
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.  
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**    
00044  * @ingroup groupFilters    
00045  */
00046 
00047 /**    
00048  * @addtogroup Corr    
00049  * @{    
00050  */
00051 
00052 /**    
00053  * @brief Correlation of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4.    
00054  * @param[in] *pSrcA points to the first input sequence.    
00055  * @param[in] srcALen length of the first input sequence.    
00056  * @param[in] *pSrcB points to the second input sequence.    
00057  * @param[in] srcBLen length of the second input sequence.    
00058  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.    
00059  * @return none.    
00060  *    
00061  * @details    
00062  * <b>Scaling and Overflow Behavior:</b>    
00063  *    
00064  * \par    
00065  * This function is optimized for speed at the expense of fixed-point precision and overflow protection.    
00066  * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.    
00067  * These intermediate results are accumulated in a 32-bit register in 2.30 format.    
00068  * Finally, the accumulator is saturated and converted to a 1.31 result.    
00069  *    
00070  * \par    
00071  * The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result.    
00072  * In order to avoid overflows completely the input signals must be scaled down.    
00073  * The input signals should be scaled down to avoid intermediate overflows.    
00074  * Scale down one of the inputs by 1/min(srcALen, srcBLen)to avoid overflows since a    
00075  * maximum of min(srcALen, srcBLen) number of additions is carried internally.    
00076  *    
00077  * \par    
00078  * See <code>arm_correlate_q31()</code> for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.    
00079  */
00080 
00081 void arm_correlate_fast_q31(
00082   q31_t * pSrcA,
00083   uint32_t srcALen,
00084   q31_t * pSrcB,
00085   uint32_t srcBLen,
00086   q31_t * pDst)
00087 {
00088   q31_t *pIn1;                                   /* inputA pointer               */
00089   q31_t *pIn2;                                   /* inputB pointer               */
00090   q31_t *pOut = pDst;                            /* output pointer               */
00091   q31_t *px;                                     /* Intermediate inputA pointer  */
00092   q31_t *py;                                     /* Intermediate inputB pointer  */
00093   q31_t *pSrc1;                                  /* Intermediate pointers        */
00094   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */
00095   q31_t x0, x1, x2, x3, c0;                      /* temporary variables for holding input and coefficient values */
00096   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */
00097   int32_t inc = 1;                               /* Destination address modifier */
00098 
00099 
00100   /* The algorithm implementation is based on the lengths of the inputs. */
00101   /* srcB is always made to slide across srcA. */
00102   /* So srcBLen is always considered as shorter or equal to srcALen */
00103   if(srcALen >= srcBLen)
00104   {
00105     /* Initialization of inputA pointer */
00106     pIn1 = (pSrcA);
00107 
00108     /* Initialization of inputB pointer */
00109     pIn2 = (pSrcB);
00110 
00111     /* Number of output samples is calculated */
00112     outBlockSize = (2u * srcALen) - 1u;
00113 
00114     /* When srcALen > srcBLen, zero padding is done to srcB    
00115      * to make their lengths equal.    
00116      * Instead, (outBlockSize - (srcALen + srcBLen - 1))    
00117      * number of output samples are made zero */
00118     j = outBlockSize - (srcALen + (srcBLen - 1u));
00119 
00120     /* Updating the pointer position to non zero value */
00121     pOut += j;
00122 
00123   }
00124   else
00125   {
00126     /* Initialization of inputA pointer */
00127     pIn1 = (pSrcB);
00128 
00129     /* Initialization of inputB pointer */
00130     pIn2 = (pSrcA);
00131 
00132     /* srcBLen is always considered as shorter or equal to srcALen */
00133     j = srcBLen;
00134     srcBLen = srcALen;
00135     srcALen = j;
00136 
00137     /* CORR(x, y) = Reverse order(CORR(y, x)) */
00138     /* Hence set the destination pointer to point to the last output sample */
00139     pOut = pDst + ((srcALen + srcBLen) - 2u);
00140 
00141     /* Destination address modifier is set to -1 */
00142     inc = -1;
00143 
00144   }
00145 
00146   /* The function is internally    
00147    * divided into three parts according to the number of multiplications that has to be    
00148    * taken place between inputA samples and inputB samples. In the first part of the    
00149    * algorithm, the multiplications increase by one for every iteration.    
00150    * In the second part of the algorithm, srcBLen number of multiplications are done.    
00151    * In the third part of the algorithm, the multiplications decrease by one    
00152    * for every iteration.*/
00153   /* The algorithm is implemented in three stages.    
00154    * The loop counters of each stage is initiated here. */
00155   blockSize1 = srcBLen - 1u;
00156   blockSize2 = srcALen - (srcBLen - 1u);
00157   blockSize3 = blockSize1;
00158 
00159   /* --------------------------    
00160    * Initializations of stage1    
00161    * -------------------------*/
00162 
00163   /* sum = x[0] * y[srcBlen - 1]    
00164    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]    
00165    * ....    
00166    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]    
00167    */
00168 
00169   /* In this stage the MAC operations are increased by 1 for every iteration.    
00170      The count variable holds the number of MAC operations performed */
00171   count = 1u;
00172 
00173   /* Working pointer of inputA */
00174   px = pIn1;
00175 
00176   /* Working pointer of inputB */
00177   pSrc1 = pIn2 + (srcBLen - 1u);
00178   py = pSrc1;
00179 
00180   /* ------------------------    
00181    * Stage1 process    
00182    * ----------------------*/
00183 
00184   /* The first stage starts here */
00185   while(blockSize1 > 0u)
00186   {
00187     /* Accumulator is made zero for every iteration */
00188     sum = 0;
00189 
00190     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00191     k = count >> 2;
00192 
00193     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00194      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00195     while(k > 0u)
00196     {
00197       /* x[0] * y[srcBLen - 4] */
00198       sum = (q31_t) ((((q63_t) sum << 32) +
00199                       ((q63_t) * px++ * (*py++))) >> 32);
00200       /* x[1] * y[srcBLen - 3] */
00201       sum = (q31_t) ((((q63_t) sum << 32) +
00202                       ((q63_t) * px++ * (*py++))) >> 32);
00203       /* x[2] * y[srcBLen - 2] */
00204       sum = (q31_t) ((((q63_t) sum << 32) +
00205                       ((q63_t) * px++ * (*py++))) >> 32);
00206       /* x[3] * y[srcBLen - 1] */
00207       sum = (q31_t) ((((q63_t) sum << 32) +
00208                       ((q63_t) * px++ * (*py++))) >> 32);
00209 
00210       /* Decrement the loop counter */
00211       k--;
00212     }
00213 
00214     /* If the count is not a multiple of 4, compute any remaining MACs here.    
00215      ** No loop unrolling is used. */
00216     k = count % 0x4u;
00217 
00218     while(k > 0u)
00219     {
00220       /* Perform the multiply-accumulates */
00221       /* x[0] * y[srcBLen - 1] */
00222       sum = (q31_t) ((((q63_t) sum << 32) +
00223                       ((q63_t) * px++ * (*py++))) >> 32);
00224 
00225       /* Decrement the loop counter */
00226       k--;
00227     }
00228 
00229     /* Store the result in the accumulator in the destination buffer. */
00230     *pOut = sum << 1;
00231     /* Destination pointer is updated according to the address modifier, inc */
00232     pOut += inc;
00233 
00234     /* Update the inputA and inputB pointers for next MAC calculation */
00235     py = pSrc1 - count;
00236     px = pIn1;
00237 
00238     /* Increment the MAC count */
00239     count++;
00240 
00241     /* Decrement the loop counter */
00242     blockSize1--;
00243   }
00244 
00245   /* --------------------------    
00246    * Initializations of stage2    
00247    * ------------------------*/
00248 
00249   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]    
00250    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]    
00251    * ....    
00252    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]    
00253    */
00254 
00255   /* Working pointer of inputA */
00256   px = pIn1;
00257 
00258   /* Working pointer of inputB */
00259   py = pIn2;
00260 
00261   /* count is index by which the pointer pIn1 to be incremented */
00262   count = 0u;
00263 
00264   /* -------------------    
00265    * Stage2 process    
00266    * ------------------*/
00267 
00268   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.    
00269    * So, to loop unroll over blockSize2,    
00270    * srcBLen should be greater than or equal to 4 */
00271   if(srcBLen >= 4u)
00272   {
00273     /* Loop unroll over blockSize2, by 4 */
00274     blkCnt = blockSize2 >> 2u;
00275 
00276     while(blkCnt > 0u)
00277     {
00278       /* Set all accumulators to zero */
00279       acc0 = 0;
00280       acc1 = 0;
00281       acc2 = 0;
00282       acc3 = 0;
00283 
00284       /* read x[0], x[1], x[2] samples */
00285       x0 = *(px++);
00286       x1 = *(px++);
00287       x2 = *(px++);
00288 
00289       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00290       k = srcBLen >> 2u;
00291 
00292       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00293        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00294       do
00295       {
00296         /* Read y[0] sample */
00297         c0 = *(py++);
00298 
00299         /* Read x[3] sample */
00300         x3 = *(px++);
00301 
00302         /* Perform the multiply-accumulate */
00303         /* acc0 +=  x[0] * y[0] */
00304         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
00305         /* acc1 +=  x[1] * y[0] */
00306         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
00307         /* acc2 +=  x[2] * y[0] */
00308         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
00309         /* acc3 +=  x[3] * y[0] */
00310         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
00311 
00312         /* Read y[1] sample */
00313         c0 = *(py++);
00314 
00315         /* Read x[4] sample */
00316         x0 = *(px++);
00317 
00318         /* Perform the multiply-accumulates */
00319         /* acc0 +=  x[1] * y[1] */
00320         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
00321         /* acc1 +=  x[2] * y[1] */
00322         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
00323         /* acc2 +=  x[3] * y[1] */
00324         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
00325         /* acc3 +=  x[4] * y[1] */
00326         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
00327 
00328         /* Read y[2] sample */
00329         c0 = *(py++);
00330 
00331         /* Read x[5] sample */
00332         x1 = *(px++);
00333 
00334         /* Perform the multiply-accumulates */
00335         /* acc0 +=  x[2] * y[2] */
00336         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
00337         /* acc1 +=  x[3] * y[2] */
00338         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
00339         /* acc2 +=  x[4] * y[2] */
00340         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
00341         /* acc3 +=  x[5] * y[2] */
00342         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
00343 
00344         /* Read y[3] sample */
00345         c0 = *(py++);
00346 
00347         /* Read x[6] sample */
00348         x2 = *(px++);
00349 
00350         /* Perform the multiply-accumulates */
00351         /* acc0 +=  x[3] * y[3] */
00352         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
00353         /* acc1 +=  x[4] * y[3] */
00354         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
00355         /* acc2 +=  x[5] * y[3] */
00356         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
00357         /* acc3 +=  x[6] * y[3] */
00358         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
00359 
00360 
00361       } while(--k);
00362 
00363       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.    
00364        ** No loop unrolling is used. */
00365       k = srcBLen % 0x4u;
00366 
00367       while(k > 0u)
00368       {
00369         /* Read y[4] sample */
00370         c0 = *(py++);
00371 
00372         /* Read x[7] sample */
00373         x3 = *(px++);
00374 
00375         /* Perform the multiply-accumulates */
00376         /* acc0 +=  x[4] * y[4] */
00377         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
00378         /* acc1 +=  x[5] * y[4] */
00379         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
00380         /* acc2 +=  x[6] * y[4] */
00381         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
00382         /* acc3 +=  x[7] * y[4] */
00383         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
00384 
00385         /* Reuse the present samples for the next MAC */
00386         x0 = x1;
00387         x1 = x2;
00388         x2 = x3;
00389 
00390         /* Decrement the loop counter */
00391         k--;
00392       }
00393 
00394       /* Store the result in the accumulator in the destination buffer. */
00395       *pOut = (q31_t) (acc0 << 1);
00396       /* Destination pointer is updated according to the address modifier, inc */
00397       pOut += inc;
00398 
00399       *pOut = (q31_t) (acc1 << 1);
00400       pOut += inc;
00401 
00402       *pOut = (q31_t) (acc2 << 1);
00403       pOut += inc;
00404 
00405       *pOut = (q31_t) (acc3 << 1);
00406       pOut += inc;
00407 
00408       /* Increment the pointer pIn1 index, count by 4 */
00409       count += 4u;
00410 
00411       /* Update the inputA and inputB pointers for next MAC calculation */
00412       px = pIn1 + count;
00413       py = pIn2;
00414 
00415 
00416       /* Decrement the loop counter */
00417       blkCnt--;
00418     }
00419 
00420     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.    
00421      ** No loop unrolling is used. */
00422     blkCnt = blockSize2 % 0x4u;
00423 
00424     while(blkCnt > 0u)
00425     {
00426       /* Accumulator is made zero for every iteration */
00427       sum = 0;
00428 
00429       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00430       k = srcBLen >> 2u;
00431 
00432       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00433        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00434       while(k > 0u)
00435       {
00436         /* Perform the multiply-accumulates */
00437         sum = (q31_t) ((((q63_t) sum << 32) +
00438                         ((q63_t) * px++ * (*py++))) >> 32);
00439         sum = (q31_t) ((((q63_t) sum << 32) +
00440                         ((q63_t) * px++ * (*py++))) >> 32);
00441         sum = (q31_t) ((((q63_t) sum << 32) +
00442                         ((q63_t) * px++ * (*py++))) >> 32);
00443         sum = (q31_t) ((((q63_t) sum << 32) +
00444                         ((q63_t) * px++ * (*py++))) >> 32);
00445 
00446         /* Decrement the loop counter */
00447         k--;
00448       }
00449 
00450       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.    
00451        ** No loop unrolling is used. */
00452       k = srcBLen % 0x4u;
00453 
00454       while(k > 0u)
00455       {
00456         /* Perform the multiply-accumulate */
00457         sum = (q31_t) ((((q63_t) sum << 32) +
00458                         ((q63_t) * px++ * (*py++))) >> 32);
00459 
00460         /* Decrement the loop counter */
00461         k--;
00462       }
00463 
00464       /* Store the result in the accumulator in the destination buffer. */
00465       *pOut = sum << 1;
00466       /* Destination pointer is updated according to the address modifier, inc */
00467       pOut += inc;
00468 
00469       /* Increment the MAC count */
00470       count++;
00471 
00472       /* Update the inputA and inputB pointers for next MAC calculation */
00473       px = pIn1 + count;
00474       py = pIn2;
00475 
00476 
00477       /* Decrement the loop counter */
00478       blkCnt--;
00479     }
00480   }
00481   else
00482   {
00483     /* If the srcBLen is not a multiple of 4,    
00484      * the blockSize2 loop cannot be unrolled by 4 */
00485     blkCnt = blockSize2;
00486 
00487     while(blkCnt > 0u)
00488     {
00489       /* Accumulator is made zero for every iteration */
00490       sum = 0;
00491 
00492       /* Loop over srcBLen */
00493       k = srcBLen;
00494 
00495       while(k > 0u)
00496       {
00497         /* Perform the multiply-accumulate */
00498         sum = (q31_t) ((((q63_t) sum << 32) +
00499                         ((q63_t) * px++ * (*py++))) >> 32);
00500 
00501         /* Decrement the loop counter */
00502         k--;
00503       }
00504 
00505       /* Store the result in the accumulator in the destination buffer. */
00506       *pOut = sum << 1;
00507       /* Destination pointer is updated according to the address modifier, inc */
00508       pOut += inc;
00509 
00510       /* Increment the MAC count */
00511       count++;
00512 
00513       /* Update the inputA and inputB pointers for next MAC calculation */
00514       px = pIn1 + count;
00515       py = pIn2;
00516 
00517       /* Decrement the loop counter */
00518       blkCnt--;
00519     }
00520   }
00521 
00522   /* --------------------------    
00523    * Initializations of stage3    
00524    * -------------------------*/
00525 
00526   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]    
00527    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]    
00528    * ....    
00529    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]    
00530    * sum +=  x[srcALen-1] * y[0]    
00531    */
00532 
00533   /* In this stage the MAC operations are decreased by 1 for every iteration.    
00534      The count variable holds the number of MAC operations performed */
00535   count = srcBLen - 1u;
00536 
00537   /* Working pointer of inputA */
00538   pSrc1 = ((pIn1 + srcALen) - srcBLen) + 1u;
00539   px = pSrc1;
00540 
00541   /* Working pointer of inputB */
00542   py = pIn2;
00543 
00544   /* -------------------    
00545    * Stage3 process    
00546    * ------------------*/
00547 
00548   while(blockSize3 > 0u)
00549   {
00550     /* Accumulator is made zero for every iteration */
00551     sum = 0;
00552 
00553     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00554     k = count >> 2u;
00555 
00556     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00557      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00558     while(k > 0u)
00559     {
00560       /* Perform the multiply-accumulates */
00561       /* sum += x[srcALen - srcBLen + 4] * y[3] */
00562       sum = (q31_t) ((((q63_t) sum << 32) +
00563                       ((q63_t) * px++ * (*py++))) >> 32);
00564       /* sum += x[srcALen - srcBLen + 3] * y[2] */
00565       sum = (q31_t) ((((q63_t) sum << 32) +
00566                       ((q63_t) * px++ * (*py++))) >> 32);
00567       /* sum += x[srcALen - srcBLen + 2] * y[1] */
00568       sum = (q31_t) ((((q63_t) sum << 32) +
00569                       ((q63_t) * px++ * (*py++))) >> 32);
00570       /* sum += x[srcALen - srcBLen + 1] * y[0] */
00571       sum = (q31_t) ((((q63_t) sum << 32) +
00572                       ((q63_t) * px++ * (*py++))) >> 32);
00573 
00574       /* Decrement the loop counter */
00575       k--;
00576     }
00577 
00578     /* If the count is not a multiple of 4, compute any remaining MACs here.    
00579      ** No loop unrolling is used. */
00580     k = count % 0x4u;
00581 
00582     while(k > 0u)
00583     {
00584       /* Perform the multiply-accumulates */
00585       sum = (q31_t) ((((q63_t) sum << 32) +
00586                       ((q63_t) * px++ * (*py++))) >> 32);
00587 
00588       /* Decrement the loop counter */
00589       k--;
00590     }
00591 
00592     /* Store the result in the accumulator in the destination buffer. */
00593     *pOut = sum << 1;
00594     /* Destination pointer is updated according to the address modifier, inc */
00595     pOut += inc;
00596 
00597     /* Update the inputA and inputB pointers for next MAC calculation */
00598     px = ++pSrc1;
00599     py = pIn2;
00600 
00601     /* Decrement the MAC count */
00602     count--;
00603 
00604     /* Decrement the loop counter */
00605     blockSize3--;
00606   }
00607 
00608 }
00609 
00610 /**    
00611  * @} end of Corr group    
00612  */