CMSIS DSP library

Dependents:   KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more

Fork of mbed-dsp by mbed official

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_correlate_q31.c Source File

arm_correlate_q31.c

00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        17. January 2013
00005 * $Revision:    V1.4.1
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_correlate_q31.c    
00009 *    
00010 * Description:  Correlation of Q31 sequences.  
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.  
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**    
00044  * @ingroup groupFilters    
00045  */
00046 
00047 /**    
00048  * @addtogroup Corr    
00049  * @{    
00050  */
00051 
/**
 * @brief Correlation of Q31 sequences.
 * @param[in] *pSrcA points to the first input sequence.
 * @param[in] srcALen length of the first input sequence.
 * @param[in] *pSrcB points to the second input sequence.
 * @param[in] srcBLen length of the second input sequence.
 * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
 * @return none.
 *
 * @details
 * <b>Output layout:</b>
 *
 * \par
 * Only (srcALen + srcBLen - 1) output samples are computed and stored.
 * When srcALen > srcBLen the first (srcALen - srcBLen) entries of pDst are
 * skipped; when srcALen < srcBLen the last (srcBLen - srcALen) entries are
 * never reached.  These padding entries are not written by this function,
 * so the caller must clear the buffer beforehand if they are expected to
 * read as zero.
 *
 * \par
 * If either srcALen or srcBLen is zero the function returns immediately
 * without reading the inputs or writing to pDst.
 *
 * <b>Scaling and Overflow Behavior:</b>
 *
 * \par
 * The function is implemented using an internal 64-bit accumulator.
 * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
 * There is no saturation on intermediate additions.
 * Thus, if the accumulator overflows it wraps around and distorts the result.
 * The input signals should be scaled down to avoid intermediate overflows.
 * Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflows since a
 * maximum of min(srcALen, srcBLen) number of additions is carried internally.
 * The 2.62 accumulator is right shifted by 31 bits and narrowed to 1.31 format with a plain cast;
 * this implementation does not saturate, so a shifted value outside the 1.31 range is not clamped.
 *
 * \par
 * See <code>arm_correlate_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
 */

void arm_correlate_q31(
  q31_t * pSrcA,
  uint32_t srcALen,
  q31_t * pSrcB,
  uint32_t srcBLen,
  q31_t * pDst)
{

#ifndef ARM_MATH_CM0_FAMILY

  /* Run the below code for Cortex-M4 and Cortex-M3 */

  q31_t *pIn1;                                   /* long sequence ("x")          */
  q31_t *pIn2;                                   /* short sequence ("y")         */
  q31_t *pOut = pDst;                            /* output pointer               */
  q31_t *px;                                     /* working pointer into x       */
  q31_t *py;                                     /* working pointer into y       */
  q31_t *pSrc1;                                  /* per-stage base pointer       */
  q63_t sum, acc0, acc1, acc2;                   /* 2.62 accumulators            */
  q31_t x0, x1, x2, c0;                          /* cached input samples         */
  uint32_t j, k, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counters */
  int32_t inc = 1;                               /* output stride: +1 or -1      */

  /* An empty input would make (srcBLen - 1u) wrap around below and drive
   * the stage loops far outside the buffers, so bail out early. */
  if((srcALen == 0u) || (srcBLen == 0u))
  {
    return;
  }

  /* The shorter sequence always slides across the longer one, so after
   * this block srcBLen <= srcALen holds.
   * CORR(x, y) is the reverse of CORR(y, x): when the inputs are swapped
   * the output is written back-to-front (inc = -1).
   * Rather than zero padding an input, the padding output samples are
   * simply skipped; they are not written here. */
  if(srcALen >= srcBLen)
  {
    pIn1 = (pSrcA);
    pIn2 = (pSrcB);

    /* Total number of output samples */
    outBlockSize = (2u * srcALen) - 1u;

    /* Number of leading padding samples: outBlockSize - (srcALen + srcBLen - 1) */
    j = outBlockSize - (srcALen + (srcBLen - 1u));

    /* Skip over the padding samples */
    pOut += j;
  }
  else
  {
    pIn1 = (pSrcB);
    pIn2 = (pSrcA);

    /* Swap the lengths so that srcBLen is the shorter one */
    j = srcBLen;
    srcBLen = srcALen;
    srcALen = j;

    /* Start at the last computed output sample and walk backwards */
    pOut = pDst + ((srcALen + srcBLen) - 2u);
    inc = -1;
  }

  /* The computation is split into three stages by the number of MACs per
   * output sample:
   *   stage 1: MAC count grows 1, 2, ..., srcBLen - 1
   *   stage 2: MAC count is constant at srcBLen
   *   stage 3: MAC count shrinks srcBLen - 1, ..., 1 */
  blockSize1 = srcBLen - 1u;
  blockSize2 = srcALen - (srcBLen - 1u);
  blockSize3 = blockSize1;

  /* --------------------------
   * Stage 1
   * -------------------------*/

  /* sum = x[0] * y[srcBLen - 1]
   * sum = x[0] * y[srcBLen - 2] + x[1] * y[srcBLen - 1]
   * ....
   * sum = x[0] * y[1] + x[1] * y[2] +...+ x[srcBLen - 2] * y[srcBLen - 1]
   */

  /* count holds the number of MACs for the current output sample */
  count = 1u;

  /* x always starts at its first sample */
  px = pIn1;

  /* y starts at its last sample and its start moves back by one each pass */
  pSrc1 = pIn2 + (srcBLen - 1u);
  py = pSrc1;

  while(blockSize1 > 0u)
  {
    sum = 0;

    /* Four MACs per iteration ... */
    k = count >> 2;

    while(k > 0u)
    {
      sum += (q63_t) * px++ * (*py++);
      sum += (q63_t) * px++ * (*py++);
      sum += (q63_t) * px++ * (*py++);
      sum += (q63_t) * px++ * (*py++);

      k--;
    }

    /* ... then the remaining 0 to 3 MACs one at a time */
    k = count % 0x4u;

    while(k > 0u)
    {
      sum += (q63_t) * px++ * (*py++);

      k--;
    }

    /* 2.62 -> 1.31 (cast only, no saturation) */
    *pOut = (q31_t) (sum >> 31);
    pOut += inc;

    /* Rewind the pointers for the next output sample */
    py = pSrc1 - count;
    px = pIn1;

    count++;
    blockSize1--;
  }

  /* --------------------------
   * Stage 2
   * ------------------------*/

  /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
   * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
   * ....
   * sum = x[srcALen-srcBLen] * y[0] + x[srcALen-srcBLen+1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
   */

  px = pIn1;
  py = pIn2;

  /* count is the offset into x at which the current output sample starts */
  count = 0u;

  /* Every output sample of this stage needs srcBLen MACs.  The unrolled
   * variant is only used when srcBLen is at least 4. */
  if(srcBLen >= 4u)
  {
    /* Produce three output samples per outer iteration */
    blkCnt = blockSize2 / 3;

    while(blkCnt > 0u)
    {
      acc0 = 0;
      acc1 = 0;
      acc2 = 0;

      /* Preload x[0], x[1] */
      x0 = *(px++);
      x1 = *(px++);

      /* Consume three y samples per inner iteration.
       * srcBLen >= 4 guarantees k >= 1, as the do/while requires. */
      k = srcBLen / 3;

      do
      {
        /* y[0], x[2] */
        c0 = *(py);
        x2 = *(px);

        acc0 += ((q63_t) x0 * c0);
        acc1 += ((q63_t) x1 * c0);
        acc2 += ((q63_t) x2 * c0);

        /* y[1], x[3] */
        c0 = *(py + 1u);
        x0 = *(px + 1u);

        acc0 += ((q63_t) x1 * c0);
        acc1 += ((q63_t) x2 * c0);
        acc2 += ((q63_t) x0 * c0);

        /* y[2], x[4] */
        c0 = *(py + 2u);
        x1 = *(px + 2u);

        acc0 += ((q63_t) x2 * c0);
        acc1 += ((q63_t) x0 * c0);
        acc2 += ((q63_t) x1 * c0);

        px += 3u;
        py += 3u;

      } while(--k);

      /* Remaining srcBLen % 3 y samples, one at a time */
      k = srcBLen % 3u;

      while(k > 0u)
      {
        c0 = *(py++);
        x2 = *(px++);

        acc0 += ((q63_t) x0 * c0);
        acc1 += ((q63_t) x1 * c0);
        acc2 += ((q63_t) x2 * c0);

        /* Slide the cached x window by one sample */
        x0 = x1;
        x1 = x2;

        k--;
      }

      /* Store the three results (cast only, no saturation) */
      *pOut = (q31_t) (acc0 >> 31);
      pOut += inc;

      *pOut = (q31_t) (acc1 >> 31);
      pOut += inc;

      *pOut = (q31_t) (acc2 >> 31);
      pOut += inc;

      /* Advance the x offset by the three samples just produced */
      count += 3u;

      px = pIn1 + count;
      py = pIn2;

      blkCnt--;
    }

    /* Remaining blockSize2 % 3 output samples, one at a time */
    blkCnt = blockSize2 % 3u;

    while(blkCnt > 0u)
    {
      sum = 0;

      /* Four MACs per iteration ... */
      k = srcBLen >> 2u;

      while(k > 0u)
      {
        sum += (q63_t) * px++ * (*py++);
        sum += (q63_t) * px++ * (*py++);
        sum += (q63_t) * px++ * (*py++);
        sum += (q63_t) * px++ * (*py++);

        k--;
      }

      /* ... then the remaining 0 to 3 MACs */
      k = srcBLen % 0x4u;

      while(k > 0u)
      {
        sum += (q63_t) * px++ * (*py++);

        k--;
      }

      *pOut = (q31_t) (sum >> 31);
      pOut += inc;

      count++;

      px = pIn1 + count;
      py = pIn2;

      blkCnt--;
    }
  }
  else
  {
    /* srcBLen is less than 4: no unrolling, one output sample per pass */
    blkCnt = blockSize2;

    while(blkCnt > 0u)
    {
      sum = 0;

      k = srcBLen;

      while(k > 0u)
      {
        sum += (q63_t) * px++ * (*py++);

        k--;
      }

      *pOut = (q31_t) (sum >> 31);
      pOut += inc;

      count++;

      px = pIn1 + count;
      py = pIn2;

      blkCnt--;
    }
  }

  /* --------------------------
   * Stage 3
   * -------------------------*/

  /* sum = x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-2]
   * sum = x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-3]
   * ....
   * sum = x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
   * sum = x[srcALen-1] * y[0]
   */

  /* count holds the number of MACs for the current output sample */
  count = srcBLen - 1u;

  /* x starts one past the last stage-2 start and advances each pass */
  pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
  px = pSrc1;

  /* y always starts at its first sample */
  py = pIn2;

  while(blockSize3 > 0u)
  {
    sum = 0;

    /* Four MACs per iteration ... */
    k = count >> 2u;

    while(k > 0u)
    {
      sum += (q63_t) * px++ * (*py++);
      sum += (q63_t) * px++ * (*py++);
      sum += (q63_t) * px++ * (*py++);
      sum += (q63_t) * px++ * (*py++);

      k--;
    }

    /* ... then the remaining 0 to 3 MACs */
    k = count % 0x4u;

    while(k > 0u)
    {
      sum += (q63_t) * px++ * (*py++);

      k--;
    }

    *pOut = (q31_t) (sum >> 31);
    pOut += inc;

    px = ++pSrc1;
    py = pIn2;

    count--;
    blockSize3--;
  }

#else

  /* Run the below code for Cortex-M0 */

  q31_t *pIn1;                                   /* long sequence                      */
  q31_t *pIn2;                                   /* last sample of the short sequence  */
  q63_t sum;                                     /* 2.62 accumulator                   */
  uint32_t i, j;                                 /* loop counters                      */
  uint32_t inv = 0u;                             /* set when output is written reversed */
  uint32_t tot;                                  /* index of the last computed sample  */

  /* An empty input would make (srcBLen - 1u) and tot wrap around below,
   * so bail out early. */
  if((srcALen == 0u) || (srcBLen == 0u))
  {
    return;
  }

  pIn1 = pSrcA;
  pIn2 = pSrcB + (srcBLen - 1u);

  /* The shorter sequence always slides across the longer one.
   * CORR(x, y) is the reverse of CORR(y, x): when srcBLen > srcALen the
   * inputs are swapped, the output is written back-to-front and the flag
   * inv is set.
   * Rather than zero padding an input, the padding output samples are
   * simply skipped; they are not written here. */

  /* Index of the last computed output sample */
  tot = ((srcALen + srcBLen) - 2u);

  if(srcALen > srcBLen)
  {
    /* Number of leading padding samples */
    j = srcALen - srcBLen;

    /* Skip over the padding samples */
    pDst += j;
  }

  else if(srcALen < srcBLen)
  {
    /* The longer sequence becomes the first operand */
    pIn1 = pSrcB;

    /* Point at the last sample of the shorter sequence */
    pIn2 = pSrcA + (srcALen - 1u);

    /* Start at the last computed output sample */
    pDst = pDst + tot;

    /* Swap the lengths */
    j = srcALen;
    srcALen = srcBLen;
    srcBLen = j;

    inv = 1;
  }

  /* One pass per computed output sample */
  for (i = 0u; i <= tot; i++)
  {
    sum = 0;

    for (j = 0u; j <= i; j++)
    {
      /* Only accumulate where both indices fall inside their sequences */
      if((((i - j) < srcBLen) && (j < srcALen)))
      {
        /* j <= i, so (i - j) is a non-negative offset back from the last
         * sample of the short sequence.  Subtracting it from the pointer
         * avoids negating an unsigned value, which only worked through
         * 32-bit wraparound. */
        sum += ((q63_t) pIn1[j] * (*(pIn2 - (i - j))));
      }
    }

    /* 2.62 -> 1.31 (cast only, no saturation) */
    if(inv == 1)
      *pDst-- = (q31_t) (sum >> 31u);
    else
      *pDst++ = (q31_t) (sum >> 31u);
  }

#endif /*   #ifndef ARM_MATH_CM0_FAMILY */

}
00662 
00663 /**    
00664  * @} end of Corr group    
00665  */