CMSIS DSP library

Dependents:   KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more

Fork of mbed-dsp by mbed official

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_correlate_q15.c Source File

arm_correlate_q15.c

00001 /* ----------------------------------------------------------------------   
00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.   
00003 *   
00004 * $Date:        17. January 2013
00005 * $Revision:    V1.4.1
00006 *   
00007 * Project:      CMSIS DSP Library   
00008 * Title:        arm_correlate_q15.c   
00009 *   
00010 * Description:  Correlation of Q15 sequences. 
00011 *   
00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.  
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**   
00044  * @ingroup groupFilters   
00045  */
00046 
00047 /**   
00048  * @addtogroup Corr   
00049  * @{   
00050  */
00051 
00052 /**   
00053  * @brief Correlation of Q15 sequences. 
00054  * @param[in] *pSrcA points to the first input sequence.   
00055  * @param[in] srcALen length of the first input sequence.   
00056  * @param[in] *pSrcB points to the second input sequence.   
00057  * @param[in] srcBLen length of the second input sequence.   
00058  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.   
00059  * @return none.   
00060  *   
00061  * @details   
00062  * <b>Scaling and Overflow Behavior:</b>   
00063  *   
00064  * \par   
00065  * The function is implemented using a 64-bit internal accumulator.   
00066  * Both inputs are in 1.15 format and multiplications yield a 2.30 result.   
00067  * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.   
00068  * This approach provides 33 guard bits and there is no risk of overflow.   
00069  * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.   
00070  *   
00071  * \par   
00072  * Refer to <code>arm_correlate_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. 
00073  *
00074  * \par    
00075  * Refer the function <code>arm_correlate_opt_q15()</code> for a faster implementation of this function using scratch buffers.
00076  * 
00077  */
00078 
00079 void arm_correlate_q15(
00080   q15_t * pSrcA,
00081   uint32_t srcALen,
00082   q15_t * pSrcB,
00083   uint32_t srcBLen,
00084   q15_t * pDst)
00085 {
00086 
00087 #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
00088 
00089   /* Run the below code for Cortex-M4 and Cortex-M3 */
00090 
00091   q15_t *pIn1;                                   /* inputA pointer               */
00092   q15_t *pIn2;                                   /* inputB pointer               */
00093   q15_t *pOut = pDst;                            /* output pointer               */
00094   q63_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */
00095   q15_t *px;                                     /* Intermediate inputA pointer  */
00096   q15_t *py;                                     /* Intermediate inputB pointer  */
00097   q15_t *pSrc1;                                  /* Intermediate pointers        */
00098   q31_t x0, x1, x2, x3, c0;                      /* temporary variables for holding input and coefficient values */
00099   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */
00100   int32_t inc = 1;                               /* Destination address modifier */
00101 
00102 
00103   /* The algorithm implementation is based on the lengths of the inputs. */
00104   /* srcB is always made to slide across srcA. */
00105   /* So srcBLen is always considered as shorter or equal to srcALen */
00106   /* But CORR(x, y) is reverse of CORR(y, x) */
00107   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
00108   /* and the destination pointer modifier, inc is set to -1 */
00109   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
00110   /* But to improve the performance,   
00111    * we include zeroes in the output instead of zero padding either of the the inputs*/
00112   /* If srcALen > srcBLen,   
00113    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
00114   /* If srcALen < srcBLen,   
00115    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
00116   if(srcALen >= srcBLen)
00117   {
00118     /* Initialization of inputA pointer */
00119     pIn1 = (pSrcA);
00120 
00121     /* Initialization of inputB pointer */
00122     pIn2 = (pSrcB);
00123 
00124     /* Number of output samples is calculated */
00125     outBlockSize = (2u * srcALen) - 1u;
00126 
00127     /* When srcALen > srcBLen, zero padding is done to srcB   
00128      * to make their lengths equal.   
00129      * Instead, (outBlockSize - (srcALen + srcBLen - 1))   
00130      * number of output samples are made zero */
00131     j = outBlockSize - (srcALen + (srcBLen - 1u));
00132 
00133     /* Updating the pointer position to non zero value */
00134     pOut += j;
00135 
00136   }
00137   else
00138   {
00139     /* Initialization of inputA pointer */
00140     pIn1 = (pSrcB);
00141 
00142     /* Initialization of inputB pointer */
00143     pIn2 = (pSrcA);
00144 
00145     /* srcBLen is always considered as shorter or equal to srcALen */
00146     j = srcBLen;
00147     srcBLen = srcALen;
00148     srcALen = j;
00149 
00150     /* CORR(x, y) = Reverse order(CORR(y, x)) */
00151     /* Hence set the destination pointer to point to the last output sample */
00152     pOut = pDst + ((srcALen + srcBLen) - 2u);
00153 
00154     /* Destination address modifier is set to -1 */
00155     inc = -1;
00156 
00157   }
00158 
00159   /* The function is internally   
00160    * divided into three parts according to the number of multiplications that has to be   
00161    * taken place between inputA samples and inputB samples. In the first part of the   
00162    * algorithm, the multiplications increase by one for every iteration.   
00163    * In the second part of the algorithm, srcBLen number of multiplications are done.   
00164    * In the third part of the algorithm, the multiplications decrease by one   
00165    * for every iteration.*/
00166   /* The algorithm is implemented in three stages.   
00167    * The loop counters of each stage is initiated here. */
00168   blockSize1 = srcBLen - 1u;
00169   blockSize2 = srcALen - (srcBLen - 1u);
00170   blockSize3 = blockSize1;
00171 
00172   /* --------------------------   
00173    * Initializations of stage1   
00174    * -------------------------*/
00175 
00176   /* sum = x[0] * y[srcBlen - 1]   
00177    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]   
00178    * ....   
00179    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]   
00180    */
00181 
00182   /* In this stage the MAC operations are increased by 1 for every iteration.   
00183      The count variable holds the number of MAC operations performed */
00184   count = 1u;
00185 
00186   /* Working pointer of inputA */
00187   px = pIn1;
00188 
00189   /* Working pointer of inputB */
00190   pSrc1 = pIn2 + (srcBLen - 1u);
00191   py = pSrc1;
00192 
00193   /* ------------------------   
00194    * Stage1 process   
00195    * ----------------------*/
00196 
00197   /* The first loop starts here */
00198   while(blockSize1 > 0u)
00199   {
00200     /* Accumulator is made zero for every iteration */
00201     sum = 0;
00202 
00203     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00204     k = count >> 2;
00205 
00206     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00207      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00208     while(k > 0u)
00209     {
00210       /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
00211       sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
00212       /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
00213       sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
00214 
00215       /* Decrement the loop counter */
00216       k--;
00217     }
00218 
00219     /* If the count is not a multiple of 4, compute any remaining MACs here.   
00220      ** No loop unrolling is used. */
00221     k = count % 0x4u;
00222 
00223     while(k > 0u)
00224     {
00225       /* Perform the multiply-accumulates */
00226       /* x[0] * y[srcBLen - 1] */
00227       sum = __SMLALD(*px++, *py++, sum);
00228 
00229       /* Decrement the loop counter */
00230       k--;
00231     }
00232 
00233     /* Store the result in the accumulator in the destination buffer. */
00234     *pOut = (q15_t) (__SSAT((sum >> 15), 16));
00235     /* Destination pointer is updated according to the address modifier, inc */
00236     pOut += inc;
00237 
00238     /* Update the inputA and inputB pointers for next MAC calculation */
00239     py = pSrc1 - count;
00240     px = pIn1;
00241 
00242     /* Increment the MAC count */
00243     count++;
00244 
00245     /* Decrement the loop counter */
00246     blockSize1--;
00247   }
00248 
00249   /* --------------------------   
00250    * Initializations of stage2   
00251    * ------------------------*/
00252 
00253   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]   
00254    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]   
00255    * ....   
00256    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]   
00257    */
00258 
00259   /* Working pointer of inputA */
00260   px = pIn1;
00261 
00262   /* Working pointer of inputB */
00263   py = pIn2;
00264 
00265   /* count is index by which the pointer pIn1 to be incremented */
00266   count = 0u;
00267 
00268   /* -------------------   
00269    * Stage2 process   
00270    * ------------------*/
00271 
00272   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.   
00273    * So, to loop unroll over blockSize2,   
00274    * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
00275   if(srcBLen >= 4u)
00276   {
00277     /* Loop unroll over blockSize2, by 4 */
00278     blkCnt = blockSize2 >> 2u;
00279 
00280     while(blkCnt > 0u)
00281     {
00282       /* Set all accumulators to zero */
00283       acc0 = 0;
00284       acc1 = 0;
00285       acc2 = 0;
00286       acc3 = 0;
00287 
00288       /* read x[0], x[1] samples */
00289       x0 = *__SIMD32(px);
00290       /* read x[1], x[2] samples */
00291       x1 = _SIMD32_OFFSET(px + 1);
00292       px += 2u;
00293 
00294       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00295       k = srcBLen >> 2u;
00296 
00297       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00298        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00299       do
00300       {
00301         /* Read the first two inputB samples using SIMD:   
00302          * y[0] and y[1] */
00303         c0 = *__SIMD32(py)++;
00304 
00305         /* acc0 +=  x[0] * y[0] + x[1] * y[1] */
00306         acc0 = __SMLALD(x0, c0, acc0);
00307 
00308         /* acc1 +=  x[1] * y[0] + x[2] * y[1] */
00309         acc1 = __SMLALD(x1, c0, acc1);
00310 
00311         /* Read x[2], x[3] */
00312         x2 = *__SIMD32(px);
00313 
00314         /* Read x[3], x[4] */
00315         x3 = _SIMD32_OFFSET(px + 1);
00316 
00317         /* acc2 +=  x[2] * y[0] + x[3] * y[1] */
00318         acc2 = __SMLALD(x2, c0, acc2);
00319 
00320         /* acc3 +=  x[3] * y[0] + x[4] * y[1] */
00321         acc3 = __SMLALD(x3, c0, acc3);
00322 
00323         /* Read y[2] and y[3] */
00324         c0 = *__SIMD32(py)++;
00325 
00326         /* acc0 +=  x[2] * y[2] + x[3] * y[3] */
00327         acc0 = __SMLALD(x2, c0, acc0);
00328 
00329         /* acc1 +=  x[3] * y[2] + x[4] * y[3] */
00330         acc1 = __SMLALD(x3, c0, acc1);
00331 
00332         /* Read x[4], x[5] */
00333         x0 = _SIMD32_OFFSET(px + 2);
00334 
00335         /* Read x[5], x[6] */
00336         x1 = _SIMD32_OFFSET(px + 3);
00337 
00338         px += 4u;
00339 
00340         /* acc2 +=  x[4] * y[2] + x[5] * y[3] */
00341         acc2 = __SMLALD(x0, c0, acc2);
00342 
00343         /* acc3 +=  x[5] * y[2] + x[6] * y[3] */
00344         acc3 = __SMLALD(x1, c0, acc3);
00345 
00346       } while(--k);
00347 
00348       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00349        ** No loop unrolling is used. */
00350       k = srcBLen % 0x4u;
00351 
00352       if(k == 1u)
00353       {
00354         /* Read y[4] */
00355         c0 = *py;
00356 #ifdef  ARM_MATH_BIG_ENDIAN
00357 
00358         c0 = c0 << 16u;
00359 
00360 #else
00361 
00362         c0 = c0 & 0x0000FFFF;
00363 
00364 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00365         /* Read x[7] */
00366         x3 = *__SIMD32(px);
00367         px++;
00368 
00369         /* Perform the multiply-accumulates */
00370         acc0 = __SMLALD(x0, c0, acc0);
00371         acc1 = __SMLALD(x1, c0, acc1);
00372         acc2 = __SMLALDX(x1, c0, acc2);
00373         acc3 = __SMLALDX(x3, c0, acc3);
00374       }
00375 
00376       if(k == 2u)
00377       {
00378         /* Read y[4], y[5] */
00379         c0 = *__SIMD32(py);
00380 
00381         /* Read x[7], x[8] */
00382         x3 = *__SIMD32(px);
00383 
00384         /* Read x[9] */
00385         x2 = _SIMD32_OFFSET(px + 1);
00386         px += 2u;
00387 
00388         /* Perform the multiply-accumulates */
00389         acc0 = __SMLALD(x0, c0, acc0);
00390         acc1 = __SMLALD(x1, c0, acc1);
00391         acc2 = __SMLALD(x3, c0, acc2);
00392         acc3 = __SMLALD(x2, c0, acc3);
00393       }
00394 
00395       if(k == 3u)
00396       {
00397         /* Read y[4], y[5] */
00398         c0 = *__SIMD32(py)++;
00399 
00400         /* Read x[7], x[8] */
00401         x3 = *__SIMD32(px);
00402 
00403         /* Read x[9] */
00404         x2 = _SIMD32_OFFSET(px + 1);
00405 
00406         /* Perform the multiply-accumulates */
00407         acc0 = __SMLALD(x0, c0, acc0);
00408         acc1 = __SMLALD(x1, c0, acc1);
00409         acc2 = __SMLALD(x3, c0, acc2);
00410         acc3 = __SMLALD(x2, c0, acc3);
00411 
00412         c0 = (*py);
00413 
00414         /* Read y[6] */
00415 #ifdef  ARM_MATH_BIG_ENDIAN
00416 
00417         c0 = c0 << 16u;
00418 #else
00419 
00420         c0 = c0 & 0x0000FFFF;
00421 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00422         /* Read x[10] */
00423         x3 = _SIMD32_OFFSET(px + 2);
00424         px += 3u;
00425 
00426         /* Perform the multiply-accumulates */
00427         acc0 = __SMLALDX(x1, c0, acc0);
00428         acc1 = __SMLALD(x2, c0, acc1);
00429         acc2 = __SMLALDX(x2, c0, acc2);
00430         acc3 = __SMLALDX(x3, c0, acc3);
00431       }
00432 
00433       /* Store the result in the accumulator in the destination buffer. */
00434       *pOut = (q15_t) (__SSAT(acc0 >> 15, 16));
00435       /* Destination pointer is updated according to the address modifier, inc */
00436       pOut += inc;
00437 
00438       *pOut = (q15_t) (__SSAT(acc1 >> 15, 16));
00439       pOut += inc;
00440 
00441       *pOut = (q15_t) (__SSAT(acc2 >> 15, 16));
00442       pOut += inc;
00443 
00444       *pOut = (q15_t) (__SSAT(acc3 >> 15, 16));
00445       pOut += inc;
00446 
00447       /* Increment the count by 4 as 4 output values are computed */
00448       count += 4u;
00449 
00450       /* Update the inputA and inputB pointers for next MAC calculation */
00451       px = pIn1 + count;
00452       py = pIn2;
00453 
00454       /* Decrement the loop counter */
00455       blkCnt--;
00456     }
00457 
00458     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.   
00459      ** No loop unrolling is used. */
00460     blkCnt = blockSize2 % 0x4u;
00461 
00462     while(blkCnt > 0u)
00463     {
00464       /* Accumulator is made zero for every iteration */
00465       sum = 0;
00466 
00467       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00468       k = srcBLen >> 2u;
00469 
00470       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00471        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00472       while(k > 0u)
00473       {
00474         /* Perform the multiply-accumulates */
00475         sum += ((q63_t) * px++ * *py++);
00476         sum += ((q63_t) * px++ * *py++);
00477         sum += ((q63_t) * px++ * *py++);
00478         sum += ((q63_t) * px++ * *py++);
00479 
00480         /* Decrement the loop counter */
00481         k--;
00482       }
00483 
00484       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00485        ** No loop unrolling is used. */
00486       k = srcBLen % 0x4u;
00487 
00488       while(k > 0u)
00489       {
00490         /* Perform the multiply-accumulates */
00491         sum += ((q63_t) * px++ * *py++);
00492 
00493         /* Decrement the loop counter */
00494         k--;
00495       }
00496 
00497       /* Store the result in the accumulator in the destination buffer. */
00498       *pOut = (q15_t) (__SSAT(sum >> 15, 16));
00499       /* Destination pointer is updated according to the address modifier, inc */
00500       pOut += inc;
00501 
00502       /* Increment count by 1, as one output value is computed */
00503       count++;
00504 
00505       /* Update the inputA and inputB pointers for next MAC calculation */
00506       px = pIn1 + count;
00507       py = pIn2;
00508 
00509       /* Decrement the loop counter */
00510       blkCnt--;
00511     }
00512   }
00513   else
00514   {
00515     /* If the srcBLen is not a multiple of 4,   
00516      * the blockSize2 loop cannot be unrolled by 4 */
00517     blkCnt = blockSize2;
00518 
00519     while(blkCnt > 0u)
00520     {
00521       /* Accumulator is made zero for every iteration */
00522       sum = 0;
00523 
00524       /* Loop over srcBLen */
00525       k = srcBLen;
00526 
00527       while(k > 0u)
00528       {
00529         /* Perform the multiply-accumulate */
00530         sum += ((q63_t) * px++ * *py++);
00531 
00532         /* Decrement the loop counter */
00533         k--;
00534       }
00535 
00536       /* Store the result in the accumulator in the destination buffer. */
00537       *pOut = (q15_t) (__SSAT(sum >> 15, 16));
00538       /* Destination pointer is updated according to the address modifier, inc */
00539       pOut += inc;
00540 
00541       /* Increment the MAC count */
00542       count++;
00543 
00544       /* Update the inputA and inputB pointers for next MAC calculation */
00545       px = pIn1 + count;
00546       py = pIn2;
00547 
00548       /* Decrement the loop counter */
00549       blkCnt--;
00550     }
00551   }
00552 
00553   /* --------------------------   
00554    * Initializations of stage3   
00555    * -------------------------*/
00556 
00557   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]   
00558    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]   
00559    * ....   
00560    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]   
00561    * sum +=  x[srcALen-1] * y[0]   
00562    */
00563 
00564   /* In this stage the MAC operations are decreased by 1 for every iteration.   
00565      The count variable holds the number of MAC operations performed */
00566   count = srcBLen - 1u;
00567 
00568   /* Working pointer of inputA */
00569   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
00570   px = pSrc1;
00571 
00572   /* Working pointer of inputB */
00573   py = pIn2;
00574 
00575   /* -------------------   
00576    * Stage3 process   
00577    * ------------------*/
00578 
00579   while(blockSize3 > 0u)
00580   {
00581     /* Accumulator is made zero for every iteration */
00582     sum = 0;
00583 
00584     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00585     k = count >> 2u;
00586 
00587     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00588      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00589     while(k > 0u)
00590     {
00591       /* Perform the multiply-accumulates */
00592       /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
00593       sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
00594       /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
00595       sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
00596 
00597       /* Decrement the loop counter */
00598       k--;
00599     }
00600 
00601     /* If the count is not a multiple of 4, compute any remaining MACs here.   
00602      ** No loop unrolling is used. */
00603     k = count % 0x4u;
00604 
00605     while(k > 0u)
00606     {
00607       /* Perform the multiply-accumulates */
00608       sum = __SMLALD(*px++, *py++, sum);
00609 
00610       /* Decrement the loop counter */
00611       k--;
00612     }
00613 
00614     /* Store the result in the accumulator in the destination buffer. */
00615     *pOut = (q15_t) (__SSAT((sum >> 15), 16));
00616     /* Destination pointer is updated according to the address modifier, inc */
00617     pOut += inc;
00618 
00619     /* Update the inputA and inputB pointers for next MAC calculation */
00620     px = ++pSrc1;
00621     py = pIn2;
00622 
00623     /* Decrement the MAC count */
00624     count--;
00625 
00626     /* Decrement the loop counter */
00627     blockSize3--;
00628   }
00629 
00630 #else
00631 
00632 /* Run the below code for Cortex-M0 */
00633 
00634   q15_t *pIn1 = pSrcA;                           /* inputA pointer               */
00635   q15_t *pIn2 = pSrcB + (srcBLen - 1u);          /* inputB pointer               */
00636   q63_t sum;                                     /* Accumulators                  */
00637   uint32_t i = 0u, j;                            /* loop counters */
00638   uint32_t inv = 0u;                             /* Reverse order flag */
00639   uint32_t tot = 0u;                             /* Length */
00640 
00641   /* The algorithm implementation is based on the lengths of the inputs. */
00642   /* srcB is always made to slide across srcA. */
00643   /* So srcBLen is always considered as shorter or equal to srcALen */
00644   /* But CORR(x, y) is reverse of CORR(y, x) */
00645   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
00646   /* and a varaible, inv is set to 1 */
00647   /* If lengths are not equal then zero pad has to be done to  make the two   
00648    * inputs of same length. But to improve the performance, we include zeroes   
00649    * in the output instead of zero padding either of the the inputs*/
00650   /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the   
00651    * starting of the output buffer */
00652   /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the  
00653    * ending of the output buffer */
00654   /* Once the zero padding is done the remaining of the output is calcualted  
00655    * using convolution but with the shorter signal time shifted. */
00656 
00657   /* Calculate the length of the remaining sequence */
00658   tot = ((srcALen + srcBLen) - 2u);
00659 
00660   if(srcALen > srcBLen)
00661   {
00662     /* Calculating the number of zeros to be padded to the output */
00663     j = srcALen - srcBLen;
00664 
00665     /* Initialise the pointer after zero padding */
00666     pDst += j;
00667   }
00668 
00669   else if(srcALen < srcBLen)
00670   {
00671     /* Initialization to inputB pointer */
00672     pIn1 = pSrcB;
00673 
00674     /* Initialization to the end of inputA pointer */
00675     pIn2 = pSrcA + (srcALen - 1u);
00676 
00677     /* Initialisation of the pointer after zero padding */
00678     pDst = pDst + tot;
00679 
00680     /* Swapping the lengths */
00681     j = srcALen;
00682     srcALen = srcBLen;
00683     srcBLen = j;
00684 
00685     /* Setting the reverse flag */
00686     inv = 1;
00687 
00688   }
00689 
00690   /* Loop to calculate convolution for output length number of times */
00691   for (i = 0u; i <= tot; i++)
00692   {
00693     /* Initialize sum with zero to carry on MAC operations */
00694     sum = 0;
00695 
00696     /* Loop to perform MAC operations according to convolution equation */
00697     for (j = 0u; j <= i; j++)
00698     {
00699       /* Check the array limitations */
00700       if((((i - j) < srcBLen) && (j < srcALen)))
00701       {
00702         /* z[i] += x[i-j] * y[j] */
00703         sum += ((q31_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
00704       }
00705     }
00706     /* Store the output in the destination buffer */
00707     if(inv == 1)
00708       *pDst-- = (q15_t) __SSAT((sum >> 15u), 16u);
00709     else
00710       *pDst++ = (q15_t) __SSAT((sum >> 15u), 16u);
00711   }
00712 
00713 #endif /*#if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */
00714 
00715 }
00716 
00717 /**   
00718  * @} end of Corr group   
00719  */