CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_correlate_fast_q15.c Source File

arm_correlate_fast_q15.c

00001 /* ----------------------------------------------------------------------  
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        29. November 2010  
00005 * $Revision:    V1.0.3  
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_correlate_fast_q15.c  
00009 *  
00010 * Description:  Fast Q15 Correlation.  
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.3 2010/11/29 
00015 *    Re-organized the CMSIS folders and updated documentation.  
00016 *   
00017 * Version 1.0.2 2010/11/11  
00018 *    Documentation updated.   
00019 *  
00020 * Version 1.0.1 2010/10/05   
00021 *    Production release and review comments incorporated.  
00022 *  
00023 * Version 1.0.0 2010/09/20   
00024 *    Production release and review comments incorporated.  
00025 * -------------------------------------------------------------------- */ 
00026  
00027 #include "arm_math.h" 
00028  
00029 /**  
00030  * @ingroup groupFilters  
00031  */ 
00032  
00033 /**  
00034  * @addtogroup Corr  
00035  * @{  
00036  */ 
00037  
00038 /**  
00039  * @brief Correlation of Q15 sequences (fast version).  
00040  * @param[in] *pSrcA points to the first input sequence.  
00041  * @param[in] srcALen length of the first input sequence.  
00042  * @param[in] *pSrcB points to the second input sequence.  
00043  * @param[in] srcBLen length of the second input sequence.  
00044  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.  
00045  * @return none.  
00046  *  
00047  * <b>Scaling and Overflow Behavior:</b>  
00048  *  
00049  * \par  
00050  * This fast version uses a 32-bit accumulator with 2.30 format.  
00051  * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.  
00052  * There is no saturation on intermediate additions.  
00053  * Thus, if the accumulator overflows it wraps around and distorts the result.  
00054  * The input signals should be scaled down to avoid intermediate overflows.  
00055  * Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow since a  
00056  * maximum of min(srcALen, srcBLen) number of additions is carried internally.  
00057  * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.  
00058  *  
00059  * \par  
00060  * See <code>arm_correlate_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.  
00061  */ 
00062  
00063 void arm_correlate_fast_q15( 
00064   q15_t * pSrcA, 
00065   uint32_t srcALen, 
00066   q15_t * pSrcB, 
00067   uint32_t srcBLen, 
00068   q15_t * pDst) 
00069 { 
00070   q15_t *pIn1;                                   /* inputA pointer               */ 
00071   q15_t *pIn2;                                   /* inputB pointer               */ 
00072   q15_t *pOut = pDst;                            /* output pointer               */ 
00073   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */ 
00074   q15_t *px;                                     /* Intermediate inputA pointer  */ 
00075   q15_t *py;                                     /* Intermediate inputB pointer  */ 
00076   q15_t *pSrc1;                                  /* Intermediate pointers        */ 
00077   q31_t x0, x1, x2, x3, c0;                      /* temporary variables for holding input and coefficient values */ 
00078   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */ 
00079   int32_t inc = 1;                               /* Destination address modifier */ 
00080   q31_t *pb;                                     /* 32 bit pointer for inputB buffer */ 
00081  
00082  
00083   /* The algorithm implementation is based on the lengths of the inputs. */ 
00084   /* srcB is always made to slide across srcA. */ 
00085   /* So srcBLen is always considered as shorter or equal to srcALen */ 
00086   /* But CORR(x, y) is reverse of CORR(y, x) */ 
00087   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 
00088   /* and the destination pointer modifier, inc is set to -1 */ 
00089   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ 
00090   /* But to improve the performance,  
00091    * we include zeroes in the output instead of zero padding either of the the inputs*/ 
00092   /* If srcALen > srcBLen,  
00093    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ 
00094   /* If srcALen < srcBLen,  
00095    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ 
00096   if(srcALen >= srcBLen) 
00097   { 
00098     /* Initialization of inputA pointer */ 
00099     pIn1 = (pSrcA); 
00100  
00101     /* Initialization of inputB pointer */ 
00102     pIn2 = (pSrcB); 
00103  
00104     /* Number of output samples is calculated */ 
00105     outBlockSize = (2u * srcALen) - 1u; 
00106  
00107     /* When srcALen > srcBLen, zero padding is done to srcB  
00108      * to make their lengths equal.  
00109      * Instead, (outBlockSize - (srcALen + srcBLen - 1))  
00110      * number of output samples are made zero */ 
00111     j = outBlockSize - (srcALen + (srcBLen - 1u)); 
00112  
00113     while(j > 0u) 
00114     { 
00115       /* Zero is stored in the destination buffer */ 
00116       *pOut++ = 0; 
00117  
00118       /* Decrement the loop counter */ 
00119       j--; 
00120     } 
00121  
00122   } 
00123   else 
00124   { 
00125     /* Initialization of inputA pointer */ 
00126     pIn1 = (pSrcB); 
00127  
00128     /* Initialization of inputB pointer */ 
00129     pIn2 = (pSrcA); 
00130  
00131     /* srcBLen is always considered as shorter or equal to srcALen */ 
00132     j = srcBLen; 
00133     srcBLen = srcALen; 
00134     srcALen = j; 
00135  
00136     /* CORR(x, y) = Reverse order(CORR(y, x)) */ 
00137     /* Hence set the destination pointer to point to the last output sample */ 
00138     pOut = pDst + ((srcALen + srcBLen) - 2u); 
00139  
00140     /* Destination address modifier is set to -1 */ 
00141     inc = -1; 
00142  
00143   } 
00144  
00145   /* The function is internally  
00146    * divided into three parts according to the number of multiplications that has to be  
00147    * taken place between inputA samples and inputB samples. In the first part of the  
00148    * algorithm, the multiplications increase by one for every iteration.  
00149    * In the second part of the algorithm, srcBLen number of multiplications are done.  
00150    * In the third part of the algorithm, the multiplications decrease by one  
00151    * for every iteration.*/ 
00152   /* The algorithm is implemented in three stages.  
00153    * The loop counters of each stage is initiated here. */ 
00154   blockSize1 = srcBLen - 1u; 
00155   blockSize2 = srcALen - (srcBLen - 1u); 
00156   blockSize3 = blockSize1; 
00157  
00158   /* --------------------------  
00159    * Initializations of stage1  
00160    * -------------------------*/ 
00161  
00162   /* sum = x[0] * y[srcBlen - 1]  
00163    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]  
00164    * ....  
00165    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]  
00166    */ 
00167  
00168   /* In this stage the MAC operations are increased by 1 for every iteration.  
00169      The count variable holds the number of MAC operations performed */ 
00170   count = 1u; 
00171  
00172   /* Working pointer of inputA */ 
00173   px = pIn1; 
00174  
00175   /* Working pointer of inputB */ 
00176   pSrc1 = pIn2 + (srcBLen - 1u); 
00177   py = pSrc1; 
00178  
00179   /* ------------------------  
00180    * Stage1 process  
00181    * ----------------------*/ 
00182  
00183   /* The first loop starts here */ 
00184   while(blockSize1 > 0u) 
00185   { 
00186     /* Accumulator is made zero for every iteration */ 
00187     sum = 0; 
00188  
00189     /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00190     k = count >> 2; 
00191  
00192     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00193      ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00194     while(k > 0u) 
00195     { 
00196       /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */ 
00197       sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum); 
00198       /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */ 
00199       sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum); 
00200  
00201       /* Decrement the loop counter */ 
00202       k--; 
00203     } 
00204  
00205     /* If the count is not a multiple of 4, compute any remaining MACs here.  
00206      ** No loop unrolling is used. */ 
00207     k = count % 0x4u; 
00208  
00209     while(k > 0u) 
00210     { 
00211       /* Perform the multiply-accumulates */ 
00212       /* x[0] * y[srcBLen - 1] */ 
00213       sum = __SMLAD(*px++, *py++, sum); 
00214  
00215       /* Decrement the loop counter */ 
00216       k--; 
00217     } 
00218  
00219     /* Store the result in the accumulator in the destination buffer. */ 
00220     *pOut = (q15_t) (sum >> 15); 
00221     /* Destination pointer is updated according to the address modifier, inc */ 
00222     pOut += inc; 
00223  
00224     /* Update the inputA and inputB pointers for next MAC calculation */ 
00225     py = pSrc1 - count; 
00226     px = pIn1; 
00227  
00228     /* Increment the MAC count */ 
00229     count++; 
00230  
00231     /* Decrement the loop counter */ 
00232     blockSize1--; 
00233   } 
00234  
00235   /* --------------------------  
00236    * Initializations of stage2  
00237    * ------------------------*/ 
00238  
00239   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]  
00240    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]  
00241    * ....  
00242    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]  
00243    */ 
00244  
00245   /* Working pointer of inputA */ 
00246   px = pIn1; 
00247  
00248   /* Working pointer of inputB */ 
00249   py = pIn2; 
00250  
00251   /* Initialize inputB pointer of type q31 */ 
00252   pb = (q31_t *) (py); 
00253  
00254   /* count is index by which the pointer pIn1 to be incremented */ 
00255   count = 0u; 
00256  
00257   /* -------------------  
00258    * Stage2 process  
00259    * ------------------*/ 
00260  
00261   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.  
00262    * So, to loop unroll over blockSize2,  
00263    * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */ 
00264   if(srcBLen >= 4u) 
00265   { 
00266     /* Loop unroll over blockSize2, by 4 */ 
00267     blkCnt = blockSize2 >> 2u; 
00268  
00269     while(blkCnt > 0u) 
00270     { 
00271       /* Set all accumulators to zero */ 
00272       acc0 = 0; 
00273       acc1 = 0; 
00274       acc2 = 0; 
00275       acc3 = 0; 
00276  
00277       /* read x[0], x[1] samples */ 
00278       x0 = *(q31_t *) (px++); 
00279       /* read x[1], x[2] samples */ 
00280       x1 = *(q31_t *) (px++); 
00281  
00282       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00283       k = srcBLen >> 2u; 
00284  
00285       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00286        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00287       do 
00288       { 
00289         /* Read the first two inputB samples using SIMD:  
00290          * y[0] and y[1] */ 
00291         c0 = *(pb++); 
00292  
00293         /* acc0 +=  x[0] * y[0] + x[1] * y[1] */ 
00294         acc0 = __SMLAD(x0, c0, acc0); 
00295  
00296         /* acc1 +=  x[1] * y[0] + x[2] * y[1] */ 
00297         acc1 = __SMLAD(x1, c0, acc1); 
00298  
00299         /* Read x[2], x[3] */ 
00300         x2 = *(q31_t *) (px++); 
00301  
00302         /* Read x[3], x[4] */ 
00303         x3 = *(q31_t *) (px++); 
00304  
00305         /* acc2 +=  x[2] * y[0] + x[3] * y[1] */ 
00306         acc2 = __SMLAD(x2, c0, acc2); 
00307  
00308         /* acc3 +=  x[3] * y[0] + x[4] * y[1] */ 
00309         acc3 = __SMLAD(x3, c0, acc3); 
00310  
00311         /* Read y[2] and y[3] */ 
00312         c0 = *(pb++); 
00313  
00314         /* acc0 +=  x[2] * y[2] + x[3] * y[3] */ 
00315         acc0 = __SMLAD(x2, c0, acc0); 
00316  
00317         /* acc1 +=  x[3] * y[2] + x[4] * y[3] */ 
00318         acc1 = __SMLAD(x3, c0, acc1); 
00319  
00320         /* Read x[4], x[5] */ 
00321         x0 = *(q31_t *) (px++); 
00322  
00323         /* Read x[5], x[6] */ 
00324         x1 = *(q31_t *) (px++); 
00325  
00326         /* acc2 +=  x[4] * y[2] + x[5] * y[3] */ 
00327         acc2 = __SMLAD(x0, c0, acc2); 
00328  
00329         /* acc3 +=  x[5] * y[2] + x[6] * y[3] */ 
00330         acc3 = __SMLAD(x1, c0, acc3); 
00331  
00332       } while(--k); 
00333  
00334       /* For the next MAC operations, SIMD is not used  
00335        * So, the 16 bit pointer if inputB, py is updated */ 
00336       py = (q15_t *) (pb); 
00337  
00338       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00339        ** No loop unrolling is used. */ 
00340       k = srcBLen % 0x4u; 
00341  
00342       if(k == 1u) 
00343       { 
00344         /* Read y[4] */ 
00345         c0 = *py; 
00346         c0 = c0 & 0x0000FFFF; 
00347  
00348         /* Read x[7] */ 
00349         x3 = *(q31_t *) px++; 
00350  
00351         /* Perform the multiply-accumulates */ 
00352         acc0 = __SMLAD(x0, c0, acc0); 
00353         acc1 = __SMLAD(x1, c0, acc1); 
00354         acc2 = __SMLADX(x1, c0, acc2); 
00355         acc3 = __SMLADX(x3, c0, acc3); 
00356       } 
00357  
00358       if(k == 2u) 
00359       { 
00360         /* Read y[4], y[5] */ 
00361         c0 = *(pb); 
00362  
00363         /* Read x[7], x[8] */ 
00364         x3 = *(q31_t *) px++; 
00365  
00366         /* Read x[9] */ 
00367         x2 = *(q31_t *) px++; 
00368  
00369         /* Perform the multiply-accumulates */ 
00370         acc0 = __SMLAD(x0, c0, acc0); 
00371         acc1 = __SMLAD(x1, c0, acc1); 
00372         acc2 = __SMLAD(x3, c0, acc2); 
00373         acc3 = __SMLAD(x2, c0, acc3); 
00374       } 
00375  
00376       if(k == 3u) 
00377       { 
00378         /* Read y[4], y[5] */ 
00379         c0 = *pb++; 
00380  
00381         /* Read x[7], x[8] */ 
00382         x3 = *(q31_t *) px++; 
00383  
00384         /* Read x[9] */ 
00385         x2 = *(q31_t *) px++; 
00386  
00387         /* Perform the multiply-accumulates */ 
00388         acc0 = __SMLAD(x0, c0, acc0); 
00389         acc1 = __SMLAD(x1, c0, acc1); 
00390         acc2 = __SMLAD(x3, c0, acc2); 
00391         acc3 = __SMLAD(x2, c0, acc3); 
00392  
00393         /* Read y[6] */ 
00394         c0 = (q15_t) (*pb); 
00395         c0 = c0 & 0x0000FFFF; 
00396  
00397         /* Read x[10] */ 
00398         x3 = *(q31_t *) px++; 
00399  
00400         /* Perform the multiply-accumulates */ 
00401         acc0 = __SMLADX(x1, c0, acc0); 
00402         acc1 = __SMLAD(x2, c0, acc1); 
00403         acc2 = __SMLADX(x2, c0, acc2); 
00404         acc3 = __SMLADX(x3, c0, acc3); 
00405       } 
00406  
00407       /* Store the result in the accumulator in the destination buffer. */ 
00408       *pOut = (q15_t) (acc0 >> 15); 
00409       /* Destination pointer is updated according to the address modifier, inc */ 
00410       pOut += inc; 
00411  
00412       *pOut = (q15_t) (acc1 >> 15); 
00413       pOut += inc; 
00414  
00415       *pOut = (q15_t) (acc2 >> 15); 
00416       pOut += inc; 
00417  
00418       *pOut = (q15_t) (acc3 >> 15); 
00419       pOut += inc; 
00420  
00421       /* Increment the pointer pIn1 index, count by 1 */ 
00422       count += 4u; 
00423  
00424       /* Update the inputA and inputB pointers for next MAC calculation */ 
00425       px = pIn1 + count; 
00426       py = pIn2; 
00427       pb = (q31_t *) (py); 
00428  
00429  
00430       /* Decrement the loop counter */ 
00431       blkCnt--; 
00432     } 
00433  
00434     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.  
00435      ** No loop unrolling is used. */ 
00436     blkCnt = blockSize2 % 0x4u; 
00437  
00438     while(blkCnt > 0u) 
00439     { 
00440       /* Accumulator is made zero for every iteration */ 
00441       sum = 0; 
00442  
00443       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00444       k = srcBLen >> 2u; 
00445  
00446       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00447        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00448       while(k > 0u) 
00449       { 
00450         /* Perform the multiply-accumulates */ 
00451         sum += ((q31_t) * px++ * *py++); 
00452         sum += ((q31_t) * px++ * *py++); 
00453         sum += ((q31_t) * px++ * *py++); 
00454         sum += ((q31_t) * px++ * *py++); 
00455  
00456         /* Decrement the loop counter */ 
00457         k--; 
00458       } 
00459  
00460       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00461        ** No loop unrolling is used. */ 
00462       k = srcBLen % 0x4u; 
00463  
00464       while(k > 0u) 
00465       { 
00466         /* Perform the multiply-accumulates */ 
00467         sum += ((q31_t) * px++ * *py++); 
00468  
00469         /* Decrement the loop counter */ 
00470         k--; 
00471       } 
00472  
00473       /* Store the result in the accumulator in the destination buffer. */ 
00474       *pOut = (q15_t) (sum >> 15); 
00475       /* Destination pointer is updated according to the address modifier, inc */ 
00476       pOut += inc; 
00477  
00478       /* Increment the pointer pIn1 index, count by 1 */ 
00479       count++; 
00480  
00481       /* Update the inputA and inputB pointers for next MAC calculation */ 
00482       px = pIn1 + count; 
00483       py = pIn2; 
00484  
00485       /* Decrement the loop counter */ 
00486       blkCnt--; 
00487     } 
00488   } 
00489   else 
00490   { 
00491     /* If the srcBLen is not a multiple of 4,  
00492      * the blockSize2 loop cannot be unrolled by 4 */ 
00493     blkCnt = blockSize2; 
00494  
00495     while(blkCnt > 0u) 
00496     { 
00497       /* Accumulator is made zero for every iteration */ 
00498       sum = 0; 
00499  
00500       /* Loop over srcBLen */ 
00501       k = srcBLen; 
00502  
00503       while(k > 0u) 
00504       { 
00505         /* Perform the multiply-accumulate */ 
00506         sum += ((q31_t) * px++ * *py++); 
00507  
00508         /* Decrement the loop counter */ 
00509         k--; 
00510       } 
00511  
00512       /* Store the result in the accumulator in the destination buffer. */ 
00513       *pOut = (q15_t) (sum >> 15); 
00514       /* Destination pointer is updated according to the address modifier, inc */ 
00515       pOut += inc; 
00516  
00517       /* Increment the MAC count */ 
00518       count++; 
00519  
00520       /* Update the inputA and inputB pointers for next MAC calculation */ 
00521       px = pIn1 + count; 
00522       py = pIn2; 
00523  
00524       /* Decrement the loop counter */ 
00525       blkCnt--; 
00526     } 
00527   } 
00528  
00529   /* --------------------------  
00530    * Initializations of stage3  
00531    * -------------------------*/ 
00532  
00533   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]  
00534    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]  
00535    * ....  
00536    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]  
00537    * sum +=  x[srcALen-1] * y[0]  
00538    */ 
00539  
00540   /* In this stage the MAC operations are decreased by 1 for every iteration.  
00541      The count variable holds the number of MAC operations performed */ 
00542   count = srcBLen - 1u; 
00543  
00544   /* Working pointer of inputA */ 
00545   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 
00546   px = pSrc1; 
00547  
00548   /* Working pointer of inputB */ 
00549   py = pIn2; 
00550  
00551   /* -------------------  
00552    * Stage3 process  
00553    * ------------------*/ 
00554  
00555   while(blockSize3 > 0u) 
00556   { 
00557     /* Accumulator is made zero for every iteration */ 
00558     sum = 0; 
00559  
00560     /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00561     k = count >> 2u; 
00562  
00563     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00564      ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00565     while(k > 0u) 
00566     { 
00567       /* Perform the multiply-accumulates */ 
00568       /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */ 
00569       sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum); 
00570       /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */ 
00571       sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum); 
00572  
00573       /* Decrement the loop counter */ 
00574       k--; 
00575     } 
00576  
00577     /* If the count is not a multiple of 4, compute any remaining MACs here.  
00578      ** No loop unrolling is used. */ 
00579     k = count % 0x4u; 
00580  
00581     while(k > 0u) 
00582     { 
00583       /* Perform the multiply-accumulates */ 
00584       sum = __SMLAD(*px++, *py++, sum); 
00585  
00586       /* Decrement the loop counter */ 
00587       k--; 
00588     } 
00589  
00590     /* Store the result in the accumulator in the destination buffer. */ 
00591     *pOut = (q15_t) (sum >> 15); 
00592     /* Destination pointer is updated according to the address modifier, inc */ 
00593     pOut += inc; 
00594  
00595     /* Update the inputA and inputB pointers for next MAC calculation */ 
00596     px = ++pSrc1; 
00597     py = pIn2; 
00598  
00599     /* Decrement the MAC count */ 
00600     count--; 
00601  
00602     /* Decrement the loop counter */ 
00603     blockSize3--; 
00604   } 
00605  
00606 } 
00607  
00608 /**  
00609  * @} end of Corr group  
00610  */