CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_correlate_q15.c Source File

arm_correlate_q15.c

00001 /* ----------------------------------------------------------------------  
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        29. November 2010  
00005 * $Revision:    V1.0.3  
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_correlate_q15.c  
00009 *  
00010 * Description:  Q15 Correlation.  
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.3 2010/11/29 
00015 *    Re-organized the CMSIS folders and updated documentation.  
00016 *   
00017 * Version 1.0.2 2010/11/11  
00018 *    Documentation updated.   
00019 *  
00020 * Version 1.0.1 2010/10/05   
00021 *    Production release and review comments incorporated.  
00022 *  
00023 * Version 1.0.0 2010/09/20   
00024 *    Production release and review comments incorporated  
00025 *  
00026 * Version 0.0.7  2010/06/10   
00027 *    Misra-C changes done  
00028 *  
00029 * -------------------------------------------------------------------- */ 
00030  
00031 #include "arm_math.h" 
00032  
00033 /**  
00034  * @ingroup groupFilters  
00035  */ 
00036  
00037 /**  
00038  * @addtogroup Corr  
00039  * @{  
00040  */ 
00041  
00042 /**  
00043  * @brief Correlation of Q15 sequences  
00044  * @param[in] *pSrcA points to the first input sequence.  
00045  * @param[in] srcALen length of the first input sequence.  
00046  * @param[in] *pSrcB points to the second input sequence.  
00047  * @param[in] srcBLen length of the second input sequence.  
00048  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.  
00049  * @return none.  
00050  *  
00051  * @details  
00052  * <b>Scaling and Overflow Behavior:</b>  
00053  *  
00054  * \par  
00055  * The function is implemented using a 64-bit internal accumulator.  
00056  * Both inputs are in 1.15 format and multiplications yield a 2.30 result.  
00057  * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.  
00058  * This approach provides 33 guard bits and there is no risk of overflow.  
00059  * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.  
00060  *  
00061  * \par  
00062  * Refer to <code>arm_correlate_fast_q15()</code> for a faster but less precise version of this function.  
00063  */ 
00064  
00065 void arm_correlate_q15( 
00066   q15_t * pSrcA, 
00067   uint32_t srcALen, 
00068   q15_t * pSrcB, 
00069   uint32_t srcBLen, 
00070   q15_t * pDst) 
00071 { 
00072   q15_t *pIn1;                                   /* inputA pointer               */ 
00073   q15_t *pIn2;                                   /* inputB pointer               */ 
00074   q15_t *pOut = pDst;                            /* output pointer               */ 
00075   q63_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */ 
00076   q15_t *px;                                     /* Intermediate inputA pointer  */ 
00077   q15_t *py;                                     /* Intermediate inputB pointer  */ 
00078   q15_t *pSrc1;                                  /* Intermediate pointers        */ 
00079   q31_t x0, x1, x2, x3, c0;                      /* temporary variables for holding input and coefficient values */ 
00080   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */ 
00081   int32_t inc = 1;                               /* Destination address modifier */ 
00082   q31_t *pb;                                     /* 32 bit pointer for inputB buffer */ 
00083  
00084  
00085   /* The algorithm implementation is based on the lengths of the inputs. */ 
00086   /* srcB is always made to slide across srcA. */ 
00087   /* So srcBLen is always considered as shorter or equal to srcALen */ 
00088   /* But CORR(x, y) is reverse of CORR(y, x) */ 
00089   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 
00090   /* and the destination pointer modifier, inc is set to -1 */ 
00091   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ 
00092   /* But to improve the performance,  
00093    * we include zeroes in the output instead of zero padding either of the the inputs*/ 
00094   /* If srcALen > srcBLen,  
00095    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ 
00096   /* If srcALen < srcBLen,  
00097    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ 
00098   if(srcALen >= srcBLen) 
00099   { 
00100     /* Initialization of inputA pointer */ 
00101     pIn1 = (pSrcA); 
00102  
00103     /* Initialization of inputB pointer */ 
00104     pIn2 = (pSrcB); 
00105  
00106     /* Number of output samples is calculated */ 
00107     outBlockSize = (2u * srcALen) - 1u; 
00108  
00109     /* When srcALen > srcBLen, zero padding is done to srcB  
00110      * to make their lengths equal.  
00111      * Instead, (outBlockSize - (srcALen + srcBLen - 1))  
00112      * number of output samples are made zero */ 
00113     j = outBlockSize - (srcALen + (srcBLen - 1u)); 
00114  
00115     while(j > 0u) 
00116     { 
00117       /* Zero is stored in the destination buffer */ 
00118       *pOut++ = 0; 
00119  
00120       /* Decrement the loop counter */ 
00121       j--; 
00122     } 
00123  
00124   } 
00125   else 
00126   { 
00127     /* Initialization of inputA pointer */ 
00128     pIn1 = (pSrcB); 
00129  
00130     /* Initialization of inputB pointer */ 
00131     pIn2 = (pSrcA); 
00132  
00133     /* srcBLen is always considered as shorter or equal to srcALen */ 
00134     j = srcBLen; 
00135     srcBLen = srcALen; 
00136     srcALen = j; 
00137  
00138     /* CORR(x, y) = Reverse order(CORR(y, x)) */ 
00139     /* Hence set the destination pointer to point to the last output sample */ 
00140     pOut = pDst + ((srcALen + srcBLen) - 2u); 
00141  
00142     /* Destination address modifier is set to -1 */ 
00143     inc = -1; 
00144  
00145   } 
00146  
00147   /* The function is internally  
00148    * divided into three parts according to the number of multiplications that has to be  
00149    * taken place between inputA samples and inputB samples. In the first part of the  
00150    * algorithm, the multiplications increase by one for every iteration.  
00151    * In the second part of the algorithm, srcBLen number of multiplications are done.  
00152    * In the third part of the algorithm, the multiplications decrease by one  
00153    * for every iteration.*/ 
00154   /* The algorithm is implemented in three stages.  
00155    * The loop counters of each stage is initiated here. */ 
00156   blockSize1 = srcBLen - 1u; 
00157   blockSize2 = srcALen - (srcBLen - 1u); 
00158   blockSize3 = blockSize1; 
00159  
00160   /* --------------------------  
00161    * Initializations of stage1  
00162    * -------------------------*/ 
00163  
00164   /* sum = x[0] * y[srcBlen - 1]  
00165    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]  
00166    * ....  
00167    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]  
00168    */ 
00169  
00170   /* In this stage the MAC operations are increased by 1 for every iteration.  
00171      The count variable holds the number of MAC operations performed */ 
00172   count = 1u; 
00173  
00174   /* Working pointer of inputA */ 
00175   px = pIn1; 
00176  
00177   /* Working pointer of inputB */ 
00178   pSrc1 = pIn2 + (srcBLen - 1u); 
00179   py = pSrc1; 
00180  
00181   /* ------------------------  
00182    * Stage1 process  
00183    * ----------------------*/ 
00184  
00185   /* The first loop starts here */ 
00186   while(blockSize1 > 0u) 
00187   { 
00188     /* Accumulator is made zero for every iteration */ 
00189     sum = 0; 
00190  
00191     /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00192     k = count >> 2; 
00193  
00194     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00195      ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00196     while(k > 0u) 
00197     { 
00198       /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */ 
00199       sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum); 
00200       /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */ 
00201       sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum); 
00202  
00203       /* Decrement the loop counter */ 
00204       k--; 
00205     } 
00206  
00207     /* If the count is not a multiple of 4, compute any remaining MACs here.  
00208      ** No loop unrolling is used. */ 
00209     k = count % 0x4u; 
00210  
00211     while(k > 0u) 
00212     { 
00213       /* Perform the multiply-accumulates */ 
00214       /* x[0] * y[srcBLen - 1] */ 
00215       sum = __SMLALD(*px++, *py++, sum); 
00216  
00217       /* Decrement the loop counter */ 
00218       k--; 
00219     } 
00220  
00221     /* Store the result in the accumulator in the destination buffer. */ 
00222     *pOut = (q15_t) (__SSAT((sum >> 15), 16)); 
00223     /* Destination pointer is updated according to the address modifier, inc */ 
00224     pOut += inc; 
00225  
00226     /* Update the inputA and inputB pointers for next MAC calculation */ 
00227     py = pSrc1 - count; 
00228     px = pIn1; 
00229  
00230     /* Increment the MAC count */ 
00231     count++; 
00232  
00233     /* Decrement the loop counter */ 
00234     blockSize1--; 
00235   } 
00236  
00237   /* --------------------------  
00238    * Initializations of stage2  
00239    * ------------------------*/ 
00240  
00241   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]  
00242    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]  
00243    * ....  
00244    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]  
00245    */ 
00246  
00247   /* Working pointer of inputA */ 
00248   px = pIn1; 
00249  
00250   /* Working pointer of inputB */ 
00251   py = pIn2; 
00252  
00253   /* Initialize inputB pointer of type q31 */ 
00254   pb = (q31_t *) (py); 
00255  
00256   /* count is index by which the pointer pIn1 to be incremented */ 
00257   count = 0u; 
00258  
00259   /* -------------------  
00260    * Stage2 process  
00261    * ------------------*/ 
00262  
00263   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.  
00264    * So, to loop unroll over blockSize2,  
00265    * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */ 
00266   if(srcBLen >= 4u) 
00267   { 
00268     /* Loop unroll over blockSize2, by 4 */ 
00269     blkCnt = blockSize2 >> 2u; 
00270  
00271     while(blkCnt > 0u) 
00272     { 
00273       /* Set all accumulators to zero */ 
00274       acc0 = 0; 
00275       acc1 = 0; 
00276       acc2 = 0; 
00277       acc3 = 0; 
00278  
00279       /* read x[0], x[1] samples */ 
00280       x0 = *(q31_t *) (px++); 
00281       /* read x[1], x[2] samples */ 
00282       x1 = *(q31_t *) (px++); 
00283  
00284       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00285       k = srcBLen >> 2u; 
00286  
00287       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00288        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00289       do 
00290       { 
00291         /* Read the first two inputB samples using SIMD:  
00292          * y[0] and y[1] */ 
00293         c0 = *(pb++); 
00294  
00295         /* acc0 +=  x[0] * y[0] + x[1] * y[1] */ 
00296         acc0 = __SMLALD(x0, c0, acc0); 
00297  
00298         /* acc1 +=  x[1] * y[0] + x[2] * y[1] */ 
00299         acc1 = __SMLALD(x1, c0, acc1); 
00300  
00301         /* Read x[2], x[3] */ 
00302         x2 = *(q31_t *) (px++); 
00303  
00304         /* Read x[3], x[4] */ 
00305         x3 = *(q31_t *) (px++); 
00306  
00307         /* acc2 +=  x[2] * y[0] + x[3] * y[1] */ 
00308         acc2 = __SMLALD(x2, c0, acc2); 
00309  
00310         /* acc3 +=  x[3] * y[0] + x[4] * y[1] */ 
00311         acc3 = __SMLALD(x3, c0, acc3); 
00312  
00313         /* Read y[2] and y[3] */ 
00314         c0 = *(pb++); 
00315  
00316         /* acc0 +=  x[2] * y[2] + x[3] * y[3] */ 
00317         acc0 = __SMLALD(x2, c0, acc0); 
00318  
00319         /* acc1 +=  x[3] * y[2] + x[4] * y[3] */ 
00320         acc1 = __SMLALD(x3, c0, acc1); 
00321  
00322         /* Read x[4], x[5] */ 
00323         x0 = *(q31_t *) (px++); 
00324  
00325         /* Read x[5], x[6] */ 
00326         x1 = *(q31_t *) (px++); 
00327  
00328         /* acc2 +=  x[4] * y[2] + x[5] * y[3] */ 
00329         acc2 = __SMLALD(x0, c0, acc2); 
00330  
00331         /* acc3 +=  x[5] * y[2] + x[6] * y[3] */ 
00332         acc3 = __SMLALD(x1, c0, acc3); 
00333  
00334       } while(--k); 
00335  
00336       /* For the next MAC operations, SIMD is not used  
00337        * So, the 16 bit pointer if inputB, py is updated */ 
00338       py = (q15_t *) (pb); 
00339  
00340       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00341        ** No loop unrolling is used. */ 
00342       k = srcBLen % 0x4u; 
00343  
00344       if(k == 1u) 
00345       { 
00346         /* Read y[4] */ 
00347         c0 = *py; 
00348         c0 = c0 & 0x0000FFFF; 
00349  
00350         /* Read x[7] */ 
00351         x3 = *(q31_t *) px++; 
00352  
00353         /* Perform the multiply-accumulates */ 
00354         acc0 = __SMLALD(x0, c0, acc0); 
00355         acc1 = __SMLALD(x1, c0, acc1); 
00356         acc2 = __SMLALDX(x1, c0, acc2); 
00357         acc3 = __SMLALDX(x3, c0, acc3); 
00358       } 
00359  
00360       if(k == 2u) 
00361       { 
00362         /* Read y[4], y[5] */ 
00363         c0 = *(pb); 
00364  
00365         /* Read x[7], x[8] */ 
00366         x3 = *(q31_t *) px++; 
00367  
00368         /* Read x[9] */ 
00369         x2 = *(q31_t *) px++; 
00370  
00371         /* Perform the multiply-accumulates */ 
00372         acc0 = __SMLALD(x0, c0, acc0); 
00373         acc1 = __SMLALD(x1, c0, acc1); 
00374         acc2 = __SMLALD(x3, c0, acc2); 
00375         acc3 = __SMLALD(x2, c0, acc3); 
00376       } 
00377  
00378       if(k == 3u) 
00379       { 
00380         /* Read y[4], y[5] */ 
00381         c0 = *pb++; 
00382  
00383         /* Read x[7], x[8] */ 
00384         x3 = *(q31_t *) px++; 
00385  
00386         /* Read x[9] */ 
00387         x2 = *(q31_t *) px++; 
00388  
00389         /* Perform the multiply-accumulates */ 
00390         acc0 = __SMLALD(x0, c0, acc0); 
00391         acc1 = __SMLALD(x1, c0, acc1); 
00392         acc2 = __SMLALD(x3, c0, acc2); 
00393         acc3 = __SMLALD(x2, c0, acc3); 
00394  
00395         /* Read y[6] */ 
00396         c0 = (q15_t) (*pb); 
00397         c0 = c0 & 0x0000FFFF; 
00398  
00399         /* Read x[10] */ 
00400         x3 = *(q31_t *) px++; 
00401  
00402         /* Perform the multiply-accumulates */ 
00403         acc0 = __SMLALDX(x1, c0, acc0); 
00404         acc1 = __SMLALD(x2, c0, acc1); 
00405         acc2 = __SMLALDX(x2, c0, acc2); 
00406         acc3 = __SMLALDX(x3, c0, acc3); 
00407       } 
00408  
00409       /* Store the result in the accumulator in the destination buffer. */ 
00410       *pOut = (q15_t) (__SSAT(acc0 >> 15, 16)); 
00411       /* Destination pointer is updated according to the address modifier, inc */ 
00412       pOut += inc; 
00413  
00414       *pOut = (q15_t) (__SSAT(acc1 >> 15, 16)); 
00415       pOut += inc; 
00416  
00417       *pOut = (q15_t) (__SSAT(acc2 >> 15, 16)); 
00418       pOut += inc; 
00419  
00420       *pOut = (q15_t) (__SSAT(acc3 >> 15, 16)); 
00421       pOut += inc; 
00422  
00423       /* Increment the count by 4 as 4 output values are computed */ 
00424       count += 4u; 
00425  
00426       /* Update the inputA and inputB pointers for next MAC calculation */ 
00427       px = pIn1 + count; 
00428       py = pIn2; 
00429       pb = (q31_t *) (py); 
00430  
00431  
00432       /* Decrement the loop counter */ 
00433       blkCnt--; 
00434     } 
00435  
00436     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.  
00437      ** No loop unrolling is used. */ 
00438     blkCnt = blockSize2 % 0x4u; 
00439  
00440     while(blkCnt > 0u) 
00441     { 
00442       /* Accumulator is made zero for every iteration */ 
00443       sum = 0; 
00444  
00445       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00446       k = srcBLen >> 2u; 
00447  
00448       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00449        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00450       while(k > 0u) 
00451       { 
00452         /* Perform the multiply-accumulates */ 
00453         sum += ((q63_t) * px++ * *py++); 
00454         sum += ((q63_t) * px++ * *py++); 
00455         sum += ((q63_t) * px++ * *py++); 
00456         sum += ((q63_t) * px++ * *py++); 
00457  
00458         /* Decrement the loop counter */ 
00459         k--; 
00460       } 
00461  
00462       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00463        ** No loop unrolling is used. */ 
00464       k = srcBLen % 0x4u; 
00465  
00466       while(k > 0u) 
00467       { 
00468         /* Perform the multiply-accumulates */ 
00469         sum += ((q63_t) * px++ * *py++); 
00470  
00471         /* Decrement the loop counter */ 
00472         k--; 
00473       } 
00474  
00475       /* Store the result in the accumulator in the destination buffer. */ 
00476       *pOut = (q15_t) (__SSAT(sum >> 15, 16)); 
00477       /* Destination pointer is updated according to the address modifier, inc */ 
00478       pOut += inc; 
00479  
00480       /* Increment count by 1, as one output value is computed */ 
00481       count++; 
00482  
00483       /* Update the inputA and inputB pointers for next MAC calculation */ 
00484       px = pIn1 + count; 
00485       py = pIn2; 
00486  
00487       /* Decrement the loop counter */ 
00488       blkCnt--; 
00489     } 
00490   } 
00491   else 
00492   { 
00493     /* If the srcBLen is not a multiple of 4,  
00494      * the blockSize2 loop cannot be unrolled by 4 */ 
00495     blkCnt = blockSize2; 
00496  
00497     while(blkCnt > 0u) 
00498     { 
00499       /* Accumulator is made zero for every iteration */ 
00500       sum = 0; 
00501  
00502       /* Loop over srcBLen */ 
00503       k = srcBLen; 
00504  
00505       while(k > 0u) 
00506       { 
00507         /* Perform the multiply-accumulate */ 
00508         sum += ((q63_t) * px++ * *py++); 
00509  
00510         /* Decrement the loop counter */ 
00511         k--; 
00512       } 
00513  
00514       /* Store the result in the accumulator in the destination buffer. */ 
00515       *pOut = (q15_t) (__SSAT(sum >> 15, 16)); 
00516       /* Destination pointer is updated according to the address modifier, inc */ 
00517       pOut += inc; 
00518  
00519       /* Increment the MAC count */ 
00520       count++; 
00521  
00522       /* Update the inputA and inputB pointers for next MAC calculation */ 
00523       px = pIn1 + count; 
00524       py = pIn2; 
00525  
00526       /* Decrement the loop counter */ 
00527       blkCnt--; 
00528     } 
00529   } 
00530  
00531   /* --------------------------  
00532    * Initializations of stage3  
00533    * -------------------------*/ 
00534  
00535   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]  
00536    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]  
00537    * ....  
00538    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]  
00539    * sum +=  x[srcALen-1] * y[0]  
00540    */ 
00541  
00542   /* In this stage the MAC operations are decreased by 1 for every iteration.  
00543      The count variable holds the number of MAC operations performed */ 
00544   count = srcBLen - 1u; 
00545  
00546   /* Working pointer of inputA */ 
00547   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 
00548   px = pSrc1; 
00549  
00550   /* Working pointer of inputB */ 
00551   py = pIn2; 
00552  
00553   /* -------------------  
00554    * Stage3 process  
00555    * ------------------*/ 
00556  
00557   while(blockSize3 > 0u) 
00558   { 
00559     /* Accumulator is made zero for every iteration */ 
00560     sum = 0; 
00561  
00562     /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00563     k = count >> 2u; 
00564  
00565     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00566      ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00567     while(k > 0u) 
00568     { 
00569       /* Perform the multiply-accumulates */ 
00570       /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */ 
00571       sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum); 
00572       /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */ 
00573       sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum); 
00574  
00575       /* Decrement the loop counter */ 
00576       k--; 
00577     } 
00578  
00579     /* If the count is not a multiple of 4, compute any remaining MACs here.  
00580      ** No loop unrolling is used. */ 
00581     k = count % 0x4u; 
00582  
00583     while(k > 0u) 
00584     { 
00585       /* Perform the multiply-accumulates */ 
00586       sum = __SMLALD(*px++, *py++, sum); 
00587  
00588       /* Decrement the loop counter */ 
00589       k--; 
00590     } 
00591  
00592     /* Store the result in the accumulator in the destination buffer. */ 
00593     *pOut = (q15_t) (__SSAT((sum >> 15), 16)); 
00594     /* Destination pointer is updated according to the address modifier, inc */ 
00595     pOut += inc; 
00596  
00597     /* Update the inputA and inputB pointers for next MAC calculation */ 
00598     px = ++pSrc1; 
00599     py = pIn2; 
00600  
00601     /* Decrement the MAC count */ 
00602     count--; 
00603  
00604     /* Decrement the loop counter */ 
00605     blockSize3--; 
00606   } 
00607  
00608 } 
00609  
00610 /**  
00611  * @} end of Corr group  
00612  */