CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_correlate_fast_q31.c Source File

arm_correlate_fast_q31.c

00001 /* ----------------------------------------------------------------------  
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        29. November 2010  
00005 * $Revision:    V1.0.3  
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_correlate_fast_q31.c  
00009 *  
00010 * Description:  Fast Q31 Correlation.  
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.3 2010/11/29 
00015 *    Re-organized the CMSIS folders and updated documentation.  
00016 *   
00017 * Version 1.0.2 2010/11/11  
00018 *    Documentation updated.   
00019 *  
00020 * Version 1.0.1 2010/10/05   
00021 *    Production release and review comments incorporated.  
00022 *  
00023 * Version 1.0.0 2010/09/20   
00024 *    Production release and review comments incorporated.  
00025 * -------------------------------------------------------------------- */ 
00026  
00027 #include "arm_math.h" 
00028  
00029 /**  
00030  * @ingroup groupFilters  
00031  */ 
00032  
00033 /**  
00034  * @addtogroup Corr  
00035  * @{  
00036  */ 
00037  
00038 /**  
00039  * @brief Correlate Q31 sequences (fast version)  
00040  * @param[in] *pSrcA points to the first input sequence.  
00041  * @param[in] srcALen length of the first input sequence.  
00042  * @param[in] *pSrcB points to the second input sequence.  
00043  * @param[in] srcBLen length of the second input sequence.  
00044  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.  
00045  * @return none.  
00046  *  
00047  * @details  
00048  * <b>Scaling and Overflow Behavior:</b>  
00049  *  
00050  * \par  
00051  * This function is optimized for speed at the expense of fixed-point precision and overflow protection.  
00052  * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.  
00053  * These intermediate results are accumulated in a 32-bit register in 2.30 format.  
00054  * Finally, the accumulator is saturated and converted to a 1.31 result.  
00055  *  
00056  * \par  
00057  * The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result.  
00058  * In order to avoid overflows completely the input signals must be scaled down.  
00059  * The input signals should be scaled down to avoid intermediate overflows.  
00060  * Scale down one of the inputs by 1/min(srcALen, srcBLen)to avoid overflows since a  
00061  * maximum of min(srcALen, srcBLen) number of additions is carried internally.  
00062  *  
00063  * \par  
00064  * See <code>arm_correlate_q31()</code> for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.  
00065  */ 
00066  
00067 void arm_correlate_fast_q31( 
00068   q31_t * pSrcA, 
00069   uint32_t srcALen, 
00070   q31_t * pSrcB, 
00071   uint32_t srcBLen, 
00072   q31_t * pDst) 
00073 { 
00074   q31_t *pIn1;                                   /* inputA pointer               */ 
00075   q31_t *pIn2;                                   /* inputB pointer               */ 
00076   q31_t *pOut = pDst;                            /* output pointer               */ 
00077   q31_t *px;                                     /* Intermediate inputA pointer  */ 
00078   q31_t *py;                                     /* Intermediate inputB pointer  */ 
00079   q31_t *pSrc1;                                  /* Intermediate pointers        */ 
00080   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */ 
00081   q31_t x0, x1, x2, x3, c0;                      /* temporary variables for holding input and coefficient values */ 
00082   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */ 
00083   int32_t inc = 1;                               /* Destination address modifier */ 
00084  
00085  
00086   /* The algorithm implementation is based on the lengths of the inputs. */ 
00087   /* srcB is always made to slide across srcA. */ 
00088   /* So srcBLen is always considered as shorter or equal to srcALen */ 
00089   if(srcALen >= srcBLen) 
00090   { 
00091     /* Initialization of inputA pointer */ 
00092     pIn1 = (pSrcA); 
00093  
00094     /* Initialization of inputB pointer */ 
00095     pIn2 = (pSrcB); 
00096  
00097     /* Number of output samples is calculated */ 
00098     outBlockSize = (2u * srcALen) - 1u; 
00099  
00100     /* When srcALen > srcBLen, zero padding is done to srcB  
00101      * to make their lengths equal.  
00102      * Instead, (outBlockSize - (srcALen + srcBLen - 1))  
00103      * number of output samples are made zero */ 
00104     j = outBlockSize - (srcALen + (srcBLen - 1u)); 
00105  
00106     while(j > 0u) 
00107     { 
00108       /* Zero is stored in the destination buffer */ 
00109       *pOut++ = 0; 
00110  
00111       /* Decrement the loop counter */ 
00112       j--; 
00113     } 
00114  
00115   } 
00116   else 
00117   { 
00118     /* Initialization of inputA pointer */ 
00119     pIn1 = (pSrcB); 
00120  
00121     /* Initialization of inputB pointer */ 
00122     pIn2 = (pSrcA); 
00123  
00124     /* srcBLen is always considered as shorter or equal to srcALen */ 
00125     j = srcBLen; 
00126     srcBLen = srcALen; 
00127     srcALen = j; 
00128  
00129     /* CORR(x, y) = Reverse order(CORR(y, x)) */ 
00130     /* Hence set the destination pointer to point to the last output sample */ 
00131     pOut = pDst + ((srcALen + srcBLen) - 2u); 
00132  
00133     /* Destination address modifier is set to -1 */ 
00134     inc = -1; 
00135  
00136   } 
00137  
00138   /* The function is internally  
00139    * divided into three parts according to the number of multiplications that has to be  
00140    * taken place between inputA samples and inputB samples. In the first part of the  
00141    * algorithm, the multiplications increase by one for every iteration.  
00142    * In the second part of the algorithm, srcBLen number of multiplications are done.  
00143    * In the third part of the algorithm, the multiplications decrease by one  
00144    * for every iteration.*/ 
00145   /* The algorithm is implemented in three stages.  
00146    * The loop counters of each stage is initiated here. */ 
00147   blockSize1 = srcBLen - 1u; 
00148   blockSize2 = srcALen - (srcBLen - 1u); 
00149   blockSize3 = blockSize1; 
00150  
00151   /* --------------------------  
00152    * Initializations of stage1  
00153    * -------------------------*/ 
00154  
00155   /* sum = x[0] * y[srcBlen - 1]  
00156    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]  
00157    * ....  
00158    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]  
00159    */ 
00160  
00161   /* In this stage the MAC operations are increased by 1 for every iteration.  
00162      The count variable holds the number of MAC operations performed */ 
00163   count = 1u; 
00164  
00165   /* Working pointer of inputA */ 
00166   px = pIn1; 
00167  
00168   /* Working pointer of inputB */ 
00169   pSrc1 = pIn2 + (srcBLen - 1u); 
00170   py = pSrc1; 
00171  
00172   /* ------------------------  
00173    * Stage1 process  
00174    * ----------------------*/ 
00175  
00176   /* The first stage starts here */ 
00177   while(blockSize1 > 0u) 
00178   { 
00179     /* Accumulator is made zero for every iteration */ 
00180     sum = 0; 
00181  
00182     /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00183     k = count >> 2; 
00184  
00185     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00186      ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00187     while(k > 0u) 
00188     { 
00189       /* x[0] * y[srcBLen - 4] */ 
00190       sum = (q31_t) ((((q63_t) sum << 32) + 
00191                       ((q63_t) * px++ * (*py++))) >> 32); 
00192       /* x[1] * y[srcBLen - 3] */ 
00193       sum = (q31_t) ((((q63_t) sum << 32) + 
00194                       ((q63_t) * px++ * (*py++))) >> 32); 
00195       /* x[2] * y[srcBLen - 2] */ 
00196       sum = (q31_t) ((((q63_t) sum << 32) + 
00197                       ((q63_t) * px++ * (*py++))) >> 32); 
00198       /* x[3] * y[srcBLen - 1] */ 
00199       sum = (q31_t) ((((q63_t) sum << 32) + 
00200                       ((q63_t) * px++ * (*py++))) >> 32); 
00201  
00202       /* Decrement the loop counter */ 
00203       k--; 
00204     } 
00205  
00206     /* If the count is not a multiple of 4, compute any remaining MACs here.  
00207      ** No loop unrolling is used. */ 
00208     k = count % 0x4u; 
00209  
00210     while(k > 0u) 
00211     { 
00212       /* Perform the multiply-accumulates */ 
00213       /* x[0] * y[srcBLen - 1] */ 
00214       sum = (q31_t) ((((q63_t) sum << 32) + 
00215                       ((q63_t) * px++ * (*py++))) >> 32); 
00216  
00217       /* Decrement the loop counter */ 
00218       k--; 
00219     } 
00220  
00221     /* Store the result in the accumulator in the destination buffer. */ 
00222     *pOut = sum << 1; 
00223     /* Destination pointer is updated according to the address modifier, inc */ 
00224     pOut += inc; 
00225  
00226     /* Update the inputA and inputB pointers for next MAC calculation */ 
00227     py = pSrc1 - count; 
00228     px = pIn1; 
00229  
00230     /* Increment the MAC count */ 
00231     count++; 
00232  
00233     /* Decrement the loop counter */ 
00234     blockSize1--; 
00235   } 
00236  
00237   /* --------------------------  
00238    * Initializations of stage2  
00239    * ------------------------*/ 
00240  
00241   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]  
00242    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]  
00243    * ....  
00244    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]  
00245    */ 
00246  
00247   /* Working pointer of inputA */ 
00248   px = pIn1; 
00249  
00250   /* Working pointer of inputB */ 
00251   py = pIn2; 
00252  
00253   /* count is index by which the pointer pIn1 to be incremented */ 
00254   count = 1u; 
00255  
00256   /* -------------------  
00257    * Stage2 process  
00258    * ------------------*/ 
00259  
00260   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.  
00261    * So, to loop unroll over blockSize2,  
00262    * srcBLen should be greater than or equal to 4 */ 
00263   if(srcBLen >= 4u) 
00264   { 
00265     /* Loop unroll over blockSize2, by 4 */ 
00266     blkCnt = blockSize2 >> 2u; 
00267  
00268     while(blkCnt > 0u) 
00269     { 
00270       /* Set all accumulators to zero */ 
00271       acc0 = 0; 
00272       acc1 = 0; 
00273       acc2 = 0; 
00274       acc3 = 0; 
00275  
00276       /* read x[0], x[1], x[2] samples */ 
00277       x0 = *(px++); 
00278       x1 = *(px++); 
00279       x2 = *(px++); 
00280  
00281       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00282       k = srcBLen >> 2u; 
00283  
00284       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00285        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00286       do 
00287       { 
00288         /* Read y[0] sample */ 
00289         c0 = *(py++); 
00290  
00291         /* Read x[3] sample */ 
00292         x3 = *(px++); 
00293  
00294         /* Perform the multiply-accumulate */ 
00295         /* acc0 +=  x[0] * y[0] */ 
00296         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 
00297         /* acc1 +=  x[1] * y[0] */ 
00298         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 
00299         /* acc2 +=  x[2] * y[0] */ 
00300         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 
00301         /* acc3 +=  x[3] * y[0] */ 
00302         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 
00303  
00304         /* Read y[1] sample */ 
00305         c0 = *(py++); 
00306  
00307         /* Read x[4] sample */ 
00308         x0 = *(px++); 
00309  
00310         /* Perform the multiply-accumulates */ 
00311         /* acc0 +=  x[1] * y[1] */ 
00312         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32); 
00313         /* acc1 +=  x[2] * y[1] */ 
00314         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32); 
00315         /* acc2 +=  x[3] * y[1] */ 
00316         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32); 
00317         /* acc3 +=  x[4] * y[1] */ 
00318         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32); 
00319  
00320         /* Read y[2] sample */ 
00321         c0 = *(py++); 
00322  
00323         /* Read x[5] sample */ 
00324         x1 = *(px++); 
00325  
00326         /* Perform the multiply-accumulates */ 
00327         /* acc0 +=  x[2] * y[2] */ 
00328         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32); 
00329         /* acc1 +=  x[3] * y[2] */ 
00330         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32); 
00331         /* acc2 +=  x[4] * y[2] */ 
00332         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32); 
00333         /* acc3 +=  x[5] * y[2] */ 
00334         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32); 
00335  
00336         /* Read y[3] sample */ 
00337         c0 = *(py++); 
00338  
00339         /* Read x[6] sample */ 
00340         x2 = *(px++); 
00341  
00342         /* Perform the multiply-accumulates */ 
00343         /* acc0 +=  x[3] * y[3] */ 
00344         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32); 
00345         /* acc1 +=  x[4] * y[3] */ 
00346         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32); 
00347         /* acc2 +=  x[5] * y[3] */ 
00348         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32); 
00349         /* acc3 +=  x[6] * y[3] */ 
00350         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32); 
00351  
00352  
00353       } while(--k); 
00354  
00355       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00356        ** No loop unrolling is used. */ 
00357       k = srcBLen % 0x4u; 
00358  
00359       while(k > 0u) 
00360       { 
00361         /* Read y[4] sample */ 
00362         c0 = *(py++); 
00363  
00364         /* Read x[7] sample */ 
00365         x3 = *(px++); 
00366  
00367         /* Perform the multiply-accumulates */ 
00368         /* acc0 +=  x[4] * y[4] */ 
00369         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 
00370         /* acc1 +=  x[5] * y[4] */ 
00371         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 
00372         /* acc2 +=  x[6] * y[4] */ 
00373         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 
00374         /* acc3 +=  x[7] * y[4] */ 
00375         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 
00376  
00377         /* Reuse the present samples for the next MAC */ 
00378         x0 = x1; 
00379         x1 = x2; 
00380         x2 = x3; 
00381  
00382         /* Decrement the loop counter */ 
00383         k--; 
00384       } 
00385  
00386       /* Store the result in the accumulator in the destination buffer. */ 
00387       *pOut = (q31_t) (acc0 << 1); 
00388       /* Destination pointer is updated according to the address modifier, inc */ 
00389       pOut += inc; 
00390  
00391       *pOut = (q31_t) (acc1 << 1); 
00392       pOut += inc; 
00393  
00394       *pOut = (q31_t) (acc2 << 1); 
00395       pOut += inc; 
00396  
00397       *pOut = (q31_t) (acc3 << 1); 
00398       pOut += inc; 
00399  
00400       /* Update the inputA and inputB pointers for next MAC calculation */ 
00401       px = pIn1 + (count * 4u); 
00402       py = pIn2; 
00403  
00404       /* Increment the pointer pIn1 index, count by 1 */ 
00405       count++; 
00406  
00407       /* Decrement the loop counter */ 
00408       blkCnt--; 
00409     } 
00410  
00411     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.  
00412      ** No loop unrolling is used. */ 
00413     blkCnt = blockSize2 % 0x4u; 
00414  
00415     while(blkCnt > 0u) 
00416     { 
00417       /* Accumulator is made zero for every iteration */ 
00418       sum = 0; 
00419  
00420       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00421       k = srcBLen >> 2u; 
00422  
00423       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00424        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00425       while(k > 0u) 
00426       { 
00427         /* Perform the multiply-accumulates */ 
00428         sum = (q31_t) ((((q63_t) sum << 32) + 
00429                         ((q63_t) * px++ * (*py++))) >> 32); 
00430         sum = (q31_t) ((((q63_t) sum << 32) + 
00431                         ((q63_t) * px++ * (*py++))) >> 32); 
00432         sum = (q31_t) ((((q63_t) sum << 32) + 
00433                         ((q63_t) * px++ * (*py++))) >> 32); 
00434         sum = (q31_t) ((((q63_t) sum << 32) + 
00435                         ((q63_t) * px++ * (*py++))) >> 32); 
00436  
00437         /* Decrement the loop counter */ 
00438         k--; 
00439       } 
00440  
00441       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00442        ** No loop unrolling is used. */ 
00443       k = srcBLen % 0x4u; 
00444  
00445       while(k > 0u) 
00446       { 
00447         /* Perform the multiply-accumulate */ 
00448         sum = (q31_t) ((((q63_t) sum << 32) + 
00449                         ((q63_t) * px++ * (*py++))) >> 32); 
00450  
00451         /* Decrement the loop counter */ 
00452         k--; 
00453       } 
00454  
00455       /* Store the result in the accumulator in the destination buffer. */ 
00456       *pOut = sum << 1; 
00457       /* Destination pointer is updated according to the address modifier, inc */ 
00458       pOut += inc; 
00459  
00460       /* Update the inputA and inputB pointers for next MAC calculation */ 
00461       px = pIn1 + count; 
00462       py = pIn2; 
00463  
00464       /* Increment the MAC count */ 
00465       count++; 
00466  
00467       /* Decrement the loop counter */ 
00468       blkCnt--; 
00469     } 
00470   } 
00471   else 
00472   { 
00473     /* If the srcBLen is not a multiple of 4,  
00474      * the blockSize2 loop cannot be unrolled by 4 */ 
00475     blkCnt = blockSize2; 
00476  
00477     while(blkCnt > 0u) 
00478     { 
00479       /* Accumulator is made zero for every iteration */ 
00480       sum = 0; 
00481  
00482       /* Loop over srcBLen */ 
00483       k = srcBLen; 
00484  
00485       while(k > 0u) 
00486       { 
00487         /* Perform the multiply-accumulate */ 
00488         sum = (q31_t) ((((q63_t) sum << 32) + 
00489                         ((q63_t) * px++ * (*py++))) >> 32); 
00490  
00491         /* Decrement the loop counter */ 
00492         k--; 
00493       } 
00494  
00495       /* Store the result in the accumulator in the destination buffer. */ 
00496       *pOut = sum << 1; 
00497       /* Destination pointer is updated according to the address modifier, inc */ 
00498       pOut += inc; 
00499  
00500       /* Update the inputA and inputB pointers for next MAC calculation */ 
00501       px = pIn1 + count; 
00502       py = pIn2; 
00503  
00504       /* Increment the MAC count */ 
00505       count++; 
00506  
00507       /* Decrement the loop counter */ 
00508       blkCnt--; 
00509     } 
00510   } 
00511  
00512   /* --------------------------  
00513    * Initializations of stage3  
00514    * -------------------------*/ 
00515  
00516   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]  
00517    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]  
00518    * ....  
00519    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]  
00520    * sum +=  x[srcALen-1] * y[0]  
00521    */ 
00522  
00523   /* In this stage the MAC operations are decreased by 1 for every iteration.  
00524      The count variable holds the number of MAC operations performed */ 
00525   count = srcBLen - 1u; 
00526  
00527   /* Working pointer of inputA */ 
00528   pSrc1 = ((pIn1 + srcALen) - srcBLen) + 1u; 
00529   px = pSrc1; 
00530  
00531   /* Working pointer of inputB */ 
00532   py = pIn2; 
00533  
00534   /* -------------------  
00535    * Stage3 process  
00536    * ------------------*/ 
00537  
00538   while(blockSize3 > 0u) 
00539   { 
00540     /* Accumulator is made zero for every iteration */ 
00541     sum = 0; 
00542  
00543     /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00544     k = count >> 2u; 
00545  
00546     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00547      ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00548     while(k > 0u) 
00549     { 
00550       /* Perform the multiply-accumulates */ 
00551       /* sum += x[srcALen - srcBLen + 4] * y[3] */ 
00552       sum = (q31_t) ((((q63_t) sum << 32) + 
00553                       ((q63_t) * px++ * (*py++))) >> 32); 
00554       /* sum += x[srcALen - srcBLen + 3] * y[2] */ 
00555       sum = (q31_t) ((((q63_t) sum << 32) + 
00556                       ((q63_t) * px++ * (*py++))) >> 32); 
00557       /* sum += x[srcALen - srcBLen + 2] * y[1] */ 
00558       sum = (q31_t) ((((q63_t) sum << 32) + 
00559                       ((q63_t) * px++ * (*py++))) >> 32); 
00560       /* sum += x[srcALen - srcBLen + 1] * y[0] */ 
00561       sum = (q31_t) ((((q63_t) sum << 32) + 
00562                       ((q63_t) * px++ * (*py++))) >> 32); 
00563  
00564       /* Decrement the loop counter */ 
00565       k--; 
00566     } 
00567  
00568     /* If the count is not a multiple of 4, compute any remaining MACs here.  
00569      ** No loop unrolling is used. */ 
00570     k = count % 0x4u; 
00571  
00572     while(k > 0u) 
00573     { 
00574       /* Perform the multiply-accumulates */ 
00575       sum = (q31_t) ((((q63_t) sum << 32) + 
00576                       ((q63_t) * px++ * (*py++))) >> 32); 
00577  
00578       /* Decrement the loop counter */ 
00579       k--; 
00580     } 
00581  
00582     /* Store the result in the accumulator in the destination buffer. */ 
00583     *pOut = sum << 1; 
00584     /* Destination pointer is updated according to the address modifier, inc */ 
00585     pOut += inc; 
00586  
00587     /* Update the inputA and inputB pointers for next MAC calculation */ 
00588     px = ++pSrc1; 
00589     py = pIn2; 
00590  
00591     /* Decrement the MAC count */ 
00592     count--; 
00593  
00594     /* Decrement the loop counter */ 
00595     blockSize3--; 
00596   } 
00597  
00598 } 
00599  
00600 /**  
00601  * @} end of Corr group  
00602  */