CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_correlate_q7.c Source File

arm_correlate_q7.c

00001 /* ----------------------------------------------------------------------  
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        29. November 2010  
00005 * $Revision:    V1.0.3  
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_correlate_q7.c  
00009 *  
00010 * Description:  Process function for Q7 Correlation.  
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.3 2010/11/29 
00015 *    Re-organized the CMSIS folders and updated documentation.  
00016 *   
00017 * Version 1.0.2 2010/11/11  
00018 *    Documentation updated.   
00019 *  
00020 * Version 1.0.1 2010/10/05   
00021 *    Production release and review comments incorporated.  
00022 *  
00023 * Version 1.0.0 2010/09/20   
00024 *    Production release and review comments incorporated  
00025 *  
00026 * Version 0.0.7  2010/06/10   
00027 *    Misra-C changes done  
00028 *  
00029 * -------------------------------------------------------------------- */ 
00030  
00031 #include "arm_math.h" 
00032  
00033 /**  
00034  * @ingroup groupFilters  
00035  */ 
00036  
00037 /**  
00038  * @addtogroup Corr  
00039  * @{  
00040  */ 
00041  
00042 /**
00043  * @brief Correlation of Q7 sequences.
00044  * @param[in] *pSrcA points to the first input sequence.
00045  * @param[in] srcALen length of the first input sequence.
00046  * @param[in] *pSrcB points to the second input sequence.
00047  * @param[in] srcBLen length of the second input sequence.
00048  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.  When srcALen >= srcBLen the first (srcALen - srcBLen) entries are written as zero.  When srcALen < srcBLen only entries 0 .. (srcALen + srcBLen - 2) are written; the remaining (srcBLen - srcALen) entries are not touched by this function, so the caller must clear them beforehand if zeros are expected there.
00049  * @return none.
00050  * @note Both lengths are expected to be at least 1: the stage sizes are computed as (length - 1u) in unsigned arithmetic, which wraps for a zero length, and no check is made here.
00051  * @details
00052  * <b>Scaling and Overflow Behavior:</b>
00053  *
00054  * \par
00055  * The function is implemented using a 32-bit internal accumulator.
00056  * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
00057  * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
00058  * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
00059  * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format.
00060  */
00061
00062 void arm_correlate_q7(
00063   q7_t * pSrcA,
00064   uint32_t srcALen,
00065   q7_t * pSrcB,
00066   uint32_t srcBLen,
00067   q7_t * pDst)
00068 {
00069   q7_t *pIn1;                                    /* inputA pointer: the longer (or equal length) sequence */
00070   q7_t *pIn2;                                    /* inputB pointer: the shorter (or equal length) sequence, slid across inputA */
00071   q7_t *pOut = pDst;                             /* output pointer               */
00072   q7_t *px;                                      /* Intermediate inputA pointer  */
00073   q7_t *py;                                      /* Intermediate inputB pointer  */
00074   q7_t *pSrc1;                                   /* Start position saved for stage 1 (inputB) and stage 3 (inputA) */
00075   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */
00076   q31_t input1, input2;                          /* Two samples packed as 16-bit halves, operands of __SMLAD */
00077   q15_t in1, in2;                                /* Samples widened to 16 bits before packing */
00078   q7_t x0, x1, x2, x3, c0, c1;                   /* temporary variables for holding input and coefficient values */
00079   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counters and stage sizes */
00080   int32_t inc = 1;                               /* output pointer step: +1, or -1 when the inputs are swapped */
00081
00082
00083   /* The algorithm implementation is based on the lengths of the inputs. */
00084   /* srcB is always made to slide across srcA. */
00085   /* So srcBLen is always considered as shorter or equal to srcALen */
00086   /* But CORR(x, y) is reverse of CORR(y, x) */
00087   /* So, when srcBLen > srcALen, the inputs are swapped, the output pointer is made to point to the last computed output sample */
00088   /* and the destination pointer modifier, inc is set to -1 */
00089   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
00090   /* But to improve the performance,
00091    * we include zeroes in the output instead of zero padding either of the inputs */
00092   /* If srcALen > srcBLen,
00093    * (srcALen - srcBLen) zeroes are written at the start of the output buffer */
00094   /* If srcALen < srcBLen,
00095    * (srcBLen - srcALen) zeroes belong at the end of the output buffer; this function does not write them */
00096   if(srcALen >= srcBLen)
00097   {
00098     /* Initialization of inputA pointer */
00099     pIn1 = (pSrcA);
00100
00101     /* Initialization of inputB pointer */
00102     pIn2 = (pSrcB);
00103
00104     /* Number of output samples is calculated */
00105     outBlockSize = (2u * srcALen) - 1u;
00106
00107     /* When srcALen > srcBLen, srcB would have to be zero padded
00108      * to make the lengths equal.
00109      * Instead, the first (outBlockSize - (srcALen + srcBLen - 1)) = (srcALen - srcBLen)
00110      * output samples are set to zero */
00111     j = outBlockSize - (srcALen + (srcBLen - 1u));
00112
00113     while(j > 0u)
00114     {
00115       /* Zero is stored in the destination buffer */
00116       *pOut++ = 0;
00117
00118       /* Decrement the loop counter */
00119       j--;
00120     }
00121
00122   }
00123   else
00124   {
00125     /* Initialization of inputA pointer */
00126     pIn1 = (pSrcB);
00127
00128     /* Initialization of inputB pointer */
00129     pIn2 = (pSrcA);
00130
00131     /* Swap the lengths so that srcBLen is the shorter one from here on */
00132     j = srcBLen;
00133     srcBLen = srcALen;
00134     srcALen = j;
00135
00136     /* CORR(x, y) = Reverse order(CORR(y, x)) */
00137     /* Hence set the destination pointer to point to the last computed output sample */
00138     pOut = pDst + ((srcALen + srcBLen) - 2u);
00139
00140     /* Destination address modifier is set to -1 */
00141     inc = -1;
00142
00143   }
00144
00145   /* The function is internally
00146    * divided into three parts according to the number of multiplications that take
00147    * place between inputA samples and inputB samples. In the first part of the
00148    * algorithm, the multiplications increase by one for every iteration.
00149    * In the second part of the algorithm, srcBLen number of multiplications are done.
00150    * In the third part of the algorithm, the multiplications decrease by one
00151    * for every iteration.*/
00152   /* The algorithm is implemented in three stages.
00153    * The loop counters of each stage are initialized here. */
00154   blockSize1 = srcBLen - 1u;
00155   blockSize2 = srcALen - (srcBLen - 1u);
00156   blockSize3 = blockSize1;
00157
00158   /* --------------------------
00159    * Initializations of stage1
00160    * -------------------------*/
00161
00162   /* sum = x[0] * y[srcBLen - 1]
00163    * sum = x[0] * y[srcBLen - 2] + x[1] * y[srcBLen - 1]
00164    * ....
00165    * sum = x[0] * y[1] + x[1] * y[2] +...+ x[srcBLen - 2] * y[srcBLen - 1]
00166    */
00167
00168   /* In this stage the MAC operations are increased by 1 for every iteration.
00169      The count variable holds the number of MAC operations performed */
00170   count = 1u;
00171
00172   /* Working pointer of inputA */
00173   px = pIn1;
00174
00175   /* Working pointer of inputB: starts at the last sample y[srcBLen - 1] */
00176   pSrc1 = pIn2 + (srcBLen - 1u);
00177   py = pSrc1;
00178
00179   /* ------------------------
00180    * Stage1 process
00181    * ----------------------*/
00182
00183   /* The first stage starts here */
00184   while(blockSize1 > 0u)
00185   {
00186     /* Accumulator is made zero for every iteration */
00187     sum = 0;
00188
00189     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00190     k = count >> 2;
00191
00192     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00193      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00194     while(k > 0u)
00195     {
00196       /* x[0] , x[1] */
00197       in1 = (q15_t) * px++;
00198       in2 = (q15_t) * px++;
00199       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00200
00201       /* y[srcBLen - 4] , y[srcBLen - 3] */
00202       in1 = (q15_t) * py++;
00203       in2 = (q15_t) * py++;
00204       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00205
00206       /* x[0] * y[srcBLen - 4] */
00207       /* x[1] * y[srcBLen - 3] */
00208       sum = __SMLAD(input1, input2, sum);
00209
00210       /* x[2] , x[3] */
00211       in1 = (q15_t) * px++;
00212       in2 = (q15_t) * px++;
00213       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00214
00215       /* y[srcBLen - 2] , y[srcBLen - 1] */
00216       in1 = (q15_t) * py++;
00217       in2 = (q15_t) * py++;
00218       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00219
00220       /* x[2] * y[srcBLen - 2] */
00221       /* x[3] * y[srcBLen - 1] */
00222       sum = __SMLAD(input1, input2, sum);
00223
00224
00225       /* Decrement the loop counter */
00226       k--;
00227     }
00228
00229     /* If the count is not a multiple of 4, compute any remaining MACs here.
00230      ** No loop unrolling is used. */
00231     k = count % 0x4u;
00232
00233     while(k > 0u)
00234     {
00235       /* Perform the multiply-accumulates */
00236       /* x[0] * y[srcBLen - 1] */
00237       sum += (q31_t) ((q15_t) * px++ * *py++);
00238
00239       /* Decrement the loop counter */
00240       k--;
00241     }
00242
00243     /* Drop the low 7 bits of the sum, saturate to 8 bits and store it in the destination buffer. */
00244     *pOut = (q7_t) (__SSAT(sum >> 7, 8));
00245     /* Destination pointer is updated according to the address modifier, inc */
00246     pOut += inc;
00247
00248     /* Next output: inputB starts one sample earlier, inputA restarts at x[0] */
00249     py = pSrc1 - count;
00250     px = pIn1;
00251
00252     /* Increment the MAC count */
00253     count++;
00254
00255     /* Decrement the loop counter */
00256     blockSize1--;
00257   }
00258
00259   /* --------------------------
00260    * Initializations of stage2
00261    * ------------------------*/
00262
00263   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
00264    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
00265    * ....
00266    * sum = x[srcALen-srcBLen] * y[0] + x[srcALen-srcBLen+1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
00267    */
00268
00269   /* Working pointer of inputA */
00270   px = pIn1;
00271
00272   /* Working pointer of inputB */
00273   py = pIn2;
00274
00275   /* count is the offset from pIn1 (in samples) of the first inputA sample of the current output */
00276   count = 0u;
00277
00278   /* -------------------
00279    * Stage2 process
00280    * ------------------*/
00281
00282   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
00283    * So, to loop unroll over blockSize2,
00284    * srcBLen should be greater than or equal to 4 */
00285   if(srcBLen >= 4u)
00286   {
00287     /* Loop unroll over blockSize2, by 4 */
00288     blkCnt = blockSize2 >> 2u;
00289
00290     while(blkCnt > 0u)
00291     {
00292       /* Set all accumulators to zero */
00293       acc0 = 0;
00294       acc1 = 0;
00295       acc2 = 0;
00296       acc3 = 0;
00297
00298       /* read x[0], x[1], x[2] samples */
00299       x0 = *px++;
00300       x1 = *px++;
00301       x2 = *px++;
00302
00303       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00304       k = srcBLen >> 2u;
00305
00306       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00307        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00308       do
00309       {
00310         /* Read y[0] sample */
00311         c0 = *py++;
00312         /* Read y[1] sample */
00313         c1 = *py++;
00314
00315         /* Read x[3] sample */
00316         x3 = *px++;
00317
00318         /* x[0] and x[1] are packed */
00319         in1 = (q15_t) x0;
00320         in2 = (q15_t) x1;
00321
00322         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00323
00324         /* y[0] and y[1] are packed */
00325         in1 = (q15_t) c0;
00326         in2 = (q15_t) c1;
00327
00328         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00329
00330         /* acc0 += x[0] * y[0] + x[1] * y[1]  */
00331         acc0 = __SMLAD(input1, input2, acc0);
00332
00333         /* x[1] and x[2] are packed */
00334         in1 = (q15_t) x1;
00335         in2 = (q15_t) x2;
00336
00337         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00338
00339         /* acc1 += x[1] * y[0] + x[2] * y[1] */
00340         acc1 = __SMLAD(input1, input2, acc1);
00341
00342         /* x[2] and x[3] are packed */
00343         in1 = (q15_t) x2;
00344         in2 = (q15_t) x3;
00345
00346         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00347
00348         /* acc2 += x[2] * y[0] + x[3] * y[1]  */
00349         acc2 = __SMLAD(input1, input2, acc2);
00350
00351         /* Read x[4] sample */
00352         x0 = *(px++);
00353
00354         /* x[3] and x[4] are packed */
00355         in1 = (q15_t) x3;
00356         in2 = (q15_t) x0;
00357
00358         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00359
00360         /* acc3 += x[3] * y[0] + x[4] * y[1]  */
00361         acc3 = __SMLAD(input1, input2, acc3);
00362
00363         /* Read y[2] sample */
00364         c0 = *py++;
00365         /* Read y[3] sample */
00366         c1 = *py++;
00367
00368         /* Read x[5] sample */
00369         x1 = *px++;
00370
00371         /* x[2] and x[3] are packed */
00372         in1 = (q15_t) x2;
00373         in2 = (q15_t) x3;
00374
00375         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00376
00377         /* y[2] and y[3] are packed */
00378         in1 = (q15_t) c0;
00379         in2 = (q15_t) c1;
00380
00381         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00382
00383         /* acc0 += x[2] * y[2] + x[3] * y[3]  */
00384         acc0 = __SMLAD(input1, input2, acc0);
00385
00386         /* x[3] and x[4] are packed */
00387         in1 = (q15_t) x3;
00388         in2 = (q15_t) x0;
00389
00390         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00391
00392         /* acc1 += x[3] * y[2] + x[4] * y[3]  */
00393         acc1 = __SMLAD(input1, input2, acc1);
00394
00395         /* x[4] and x[5] are packed */
00396         in1 = (q15_t) x0;
00397         in2 = (q15_t) x1;
00398
00399         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00400
00401         /* acc2 += x[4] * y[2] + x[5] * y[3]  */
00402         acc2 = __SMLAD(input1, input2, acc2);
00403
00404         /* Read x[6] sample */
00405         x2 = *px++;
00406
00407         /* x[5] and x[6] are packed */
00408         in1 = (q15_t) x1;
00409         in2 = (q15_t) x2;
00410
00411         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00412
00413         /* acc3 += x[5] * y[2] + x[6] * y[3]  */
00414         acc3 = __SMLAD(input1, input2, acc3);
00415
00416       } while(--k);
00417
00418       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00419        ** No loop unrolling is used. */
00420       k = srcBLen % 0x4u;
00421
00422       while(k > 0u)
00423       {
00424         /* Read y[4] sample */
00425         c0 = *py++;
00426
00427         /* Read x[7] sample */
00428         x3 = *px++;
00429
00430         /* Perform the multiply-accumulates */
00431         /* acc0 +=  x[4] * y[4] */
00432         acc0 += ((q15_t) x0 * c0);
00433         /* acc1 +=  x[5] * y[4] */
00434         acc1 += ((q15_t) x1 * c0);
00435         /* acc2 +=  x[6] * y[4] */
00436         acc2 += ((q15_t) x2 * c0);
00437         /* acc3 +=  x[7] * y[4] */
00438         acc3 += ((q15_t) x3 * c0);
00439
00440         /* Reuse the present samples for the next MAC */
00441         x0 = x1;
00442         x1 = x2;
00443         x2 = x3;
00444
00445         /* Decrement the loop counter */
00446         k--;
00447       }
00448
00449       /* Drop the low 7 bits of each sum, saturate to 8 bits and store the four results. */
00450       *pOut = (q7_t) (__SSAT(acc0 >> 7, 8));
00451       /* Destination pointer is updated according to the address modifier, inc */
00452       pOut += inc;
00453
00454       *pOut = (q7_t) (__SSAT(acc1 >> 7, 8));
00455       pOut += inc;
00456
00457       *pOut = (q7_t) (__SSAT(acc2 >> 7, 8));
00458       pOut += inc;
00459
00460       *pOut = (q7_t) (__SSAT(acc3 >> 7, 8));
00461       pOut += inc;
00462
00463       /* Advance the inputA sample offset past the 4 outputs just produced */
00464       count += 4u;
00465
00466       /* Update the inputA and inputB pointers for next MAC calculation */
00467       px = pIn1 + count;
00468       py = pIn2;
00469
00470       /* Decrement the loop counter */
00471       blkCnt--;
00472     }
00473
00474     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
00475      ** No loop unrolling is used. */
00476     blkCnt = blockSize2 % 0x4u;
00477
00478     while(blkCnt > 0u)
00479     {
00480       /* Accumulator is made zero for every iteration */
00481       sum = 0;
00482
00483       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00484       k = srcBLen >> 2u;
00485
00486       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00487        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00488       while(k > 0u)
00489       {
00490         /* Reading two inputs of SrcA buffer and packing */
00491         in1 = (q15_t) * px++;
00492         in2 = (q15_t) * px++;
00493         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00494
00495         /* Reading two inputs of SrcB buffer and packing */
00496         in1 = (q15_t) * py++;
00497         in2 = (q15_t) * py++;
00498         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00499
00500         /* Perform the multiply-accumulates */
00501         sum = __SMLAD(input1, input2, sum);
00502
00503         /* Reading two inputs of SrcA buffer and packing */
00504         in1 = (q15_t) * px++;
00505         in2 = (q15_t) * px++;
00506         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00507
00508         /* Reading two inputs of SrcB buffer and packing */
00509         in1 = (q15_t) * py++;
00510         in2 = (q15_t) * py++;
00511         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00512
00513         /* Perform the multiply-accumulates */
00514         sum = __SMLAD(input1, input2, sum);
00515
00516         /* Decrement the loop counter */
00517         k--;
00518       }
00519
00520       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00521        ** No loop unrolling is used. */
00522       k = srcBLen % 0x4u;
00523
00524       while(k > 0u)
00525       {
00526         /* Perform the multiply-accumulates */
00527         sum += ((q15_t) * px++ * *py++);
00528
00529         /* Decrement the loop counter */
00530         k--;
00531       }
00532
00533       /* Drop the low 7 bits of the sum, saturate to 8 bits and store it in the destination buffer. */
00534       *pOut = (q7_t) (__SSAT(sum >> 7, 8));
00535       /* Destination pointer is updated according to the address modifier, inc */
00536       pOut += inc;
00537
00538       /* Advance the inputA sample offset past the one output just produced */
00539       count++;
00540
00541       /* Update the inputA and inputB pointers for next MAC calculation */
00542       px = pIn1 + count;
00543       py = pIn2;
00544
00545       /* Decrement the loop counter */
00546       blkCnt--;
00547     }
00548   }
00549   else
00550   {
00551     /* If srcBLen is less than 4,
00552      * neither the blockSize2 loop nor the MAC loop is unrolled */
00553     blkCnt = blockSize2;
00554
00555     while(blkCnt > 0u)
00556     {
00557       /* Accumulator is made zero for every iteration */
00558       sum = 0;
00559
00560       /* Loop over srcBLen */
00561       k = srcBLen;
00562
00563       while(k > 0u)
00564       {
00565         /* Perform the multiply-accumulate */
00566         sum += ((q15_t) * px++ * *py++);
00567
00568         /* Decrement the loop counter */
00569         k--;
00570       }
00571
00572       /* Drop the low 7 bits of the sum, saturate to 8 bits and store it in the destination buffer. */
00573       *pOut = (q7_t) (__SSAT(sum >> 7, 8));
00574       /* Destination pointer is updated according to the address modifier, inc */
00575       pOut += inc;
00576
00577       /* Advance the inputA sample offset past the one output just produced */
00578       count++;
00579
00580       /* Update the inputA and inputB pointers for next MAC calculation */
00581       px = pIn1 + count;
00582       py = pIn2;
00583
00584       /* Decrement the loop counter */
00585       blkCnt--;
00586     }
00587   }
00588
00589   /* --------------------------
00590    * Initializations of stage3
00591    * -------------------------*/
00592
00593   /* sum = x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-2]
00594    * sum = x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-3]
00595    * ....
00596    * sum =  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
00597    * sum =  x[srcALen-1] * y[0]
00598    */
00599
00600   /* In this stage the MAC operations are decreased by 1 for every iteration.
00601      The count variable holds the number of MAC operations performed */
00602   count = srcBLen - 1u;
00603
00604   /* Working pointer of inputA: starts at x[srcALen - srcBLen + 1] */
00605   pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
00606   px = pSrc1;
00607
00608   /* Working pointer of inputB */
00609   py = pIn2;
00610
00611   /* -------------------
00612    * Stage3 process
00613    * ------------------*/
00614
00615   while(blockSize3 > 0u)
00616   {
00617     /* Accumulator is made zero for every iteration */
00618     sum = 0;
00619
00620     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00621     k = count >> 2u;
00622
00623     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00624      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00625     while(k > 0u)
00626     {
00627       /* x[srcALen - srcBLen + 1] , x[srcALen - srcBLen + 2]  */
00628       in1 = (q15_t) * px++;
00629       in2 = (q15_t) * px++;
00630       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00631
00632       /* y[0] , y[1] */
00633       in1 = (q15_t) * py++;
00634       in2 = (q15_t) * py++;
00635       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00636
00637       /* sum += x[srcALen - srcBLen + 1] * y[0] */
00638       /* sum += x[srcALen - srcBLen + 2] * y[1] */
00639       sum = __SMLAD(input1, input2, sum);
00640
00641       /* x[srcALen - srcBLen + 3] , x[srcALen - srcBLen + 4] */
00642       in1 = (q15_t) * px++;
00643       in2 = (q15_t) * px++;
00644       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00645
00646       /* y[2] , y[3] */
00647       in1 = (q15_t) * py++;
00648       in2 = (q15_t) * py++;
00649       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00650
00651       /* sum += x[srcALen - srcBLen + 3] * y[2] */
00652       /* sum += x[srcALen - srcBLen + 4] * y[3] */
00653       sum = __SMLAD(input1, input2, sum);
00654
00655       /* Decrement the loop counter */
00656       k--;
00657     }
00658
00659     /* If the count is not a multiple of 4, compute any remaining MACs here.
00660      ** No loop unrolling is used. */
00661     k = count % 0x4u;
00662
00663     while(k > 0u)
00664     {
00665       /* Perform the multiply-accumulates */
00666       sum += ((q15_t) * px++ * *py++);
00667
00668       /* Decrement the loop counter */
00669       k--;
00670     }
00671
00672     /* Drop the low 7 bits of the sum, saturate to 8 bits and store it in the destination buffer. */
00673     *pOut = (q7_t) (__SSAT(sum >> 7, 8));
00674     /* Destination pointer is updated according to the address modifier, inc */
00675     pOut += inc;
00676
00677     /* Next output: inputA starts one sample later, inputB restarts at y[0] */
00678     px = ++pSrc1;
00679     py = pIn2;
00680
00681     /* Decrement the MAC count */
00682     count--;
00683
00684     /* Decrement the loop counter */
00685     blockSize3--;
00686   }
00687
00688 }
00689  
00690 /**  
00691  * @} end of Corr group  
00692  */