CMSIS DSP library

Dependents:   KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more

Fork of mbed-dsp by mbed official

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_correlate_fast_q15.c Source File

arm_correlate_fast_q15.c

00001 /* ----------------------------------------------------------------------   
00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.   
00003 *   
00004 * $Date:        17. January 2013
00005 * $Revision:    V1.4.1
00006 *   
00007 * Project:      CMSIS DSP Library   
00008 * Title:        arm_correlate_fast_q15.c   
00009 *   
00010 * Description:  Fast Q15 Correlation.   
00011 *   
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.   
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**   
00044  * @ingroup groupFilters   
00045  */
00046 
00047 /**   
00048  * @addtogroup Corr   
00049  * @{   
00050  */
00051 
00052 /**   
00053  * @brief Correlation of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.   
00054  * @param[in] *pSrcA points to the first input sequence.   
00055  * @param[in] srcALen length of the first input sequence.   
00056  * @param[in] *pSrcB points to the second input sequence.   
00057  * @param[in] srcBLen length of the second input sequence.   
00058  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.   
00059  * @return none.   
00060  *   
00061  * <b>Scaling and Overflow Behavior:</b>   
00062  *   
00063  * \par   
00064  * This fast version uses a 32-bit accumulator with 2.30 format.   
00065  * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.   
00066  * There is no saturation on intermediate additions.   
00067  * Thus, if the accumulator overflows it wraps around and distorts the result.   
00068  * The input signals should be scaled down to avoid intermediate overflows.   
00069  * Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow, since at most   
00070  * min(srcALen, srcBLen) additions are carried out internally.   
00071  * The 2.30 accumulator is right shifted by 15 bits and then cast to q15_t (truncated, without saturation) to yield the final result in 1.15 format.   
00072  *   
00073  * \par   
00074  * See <code>arm_correlate_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.   
00075  */
00076 
00077 void arm_correlate_fast_q15(
00078   q15_t * pSrcA,
00079   uint32_t srcALen,
00080   q15_t * pSrcB,
00081   uint32_t srcBLen,
00082   q15_t * pDst)
00083 {
00084 #ifndef UNALIGNED_SUPPORT_DISABLE
00085 
00086   q15_t *pIn1;                                   /* inputA pointer               */
00087   q15_t *pIn2;                                   /* inputB pointer               */
00088   q15_t *pOut = pDst;                            /* output pointer               */
00089   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */
00090   q15_t *px;                                     /* Intermediate inputA pointer  */
00091   q15_t *py;                                     /* Intermediate inputB pointer  */
00092   q15_t *pSrc1;                                  /* Intermediate pointers        */
00093   q31_t x0, x1, x2, x3, c0;                      /* temporary variables for holding input and coefficient values */
00094   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */
00095   int32_t inc = 1;                               /* Destination address modifier */
00096 
00097 
00098   /* The algorithm implementation is based on the lengths of the inputs. */
00099   /* srcB is always made to slide across srcA. */
00100   /* So srcBLen is always considered as shorter or equal to srcALen */
00101   /* But CORR(x, y) is reverse of CORR(y, x) */
00102   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
00103   /* and the destination pointer modifier, inc is set to -1 */
00104   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
00105   /* But to improve the performance,   
00106    * we include zeroes in the output instead of zero padding either of the inputs */
00107   /* If srcALen > srcBLen,   
00108    * (srcALen - srcBLen) zeroes have to be included at the start of the output buffer */
00109   /* If srcALen < srcBLen,   
00110    * (srcBLen - srcALen) zeroes have to be included at the end of the output buffer */
00111   if(srcALen >= srcBLen)
00112   {
00113     /* Initialization of inputA pointer */
00114     pIn1 = (pSrcA);
00115 
00116     /* Initialization of inputB pointer */
00117     pIn2 = (pSrcB);
00118 
00119     /* Number of output samples is calculated */
00120     outBlockSize = (2u * srcALen) - 1u;
00121 
00122     /* When srcALen > srcBLen, zero padding is done to srcB   
00123      * to make their lengths equal.   
00124      * Instead, (outBlockSize - (srcALen + srcBLen - 1))   
00125      * number of output samples are made zero */
00126     j = outBlockSize - (srcALen + (srcBLen - 1u));
00127 
00128     /* Updating the pointer position to non zero value */
00129     pOut += j;
00130 
00131   }
00132   else
00133   {
00134     /* Initialization of inputA pointer */
00135     pIn1 = (pSrcB);
00136 
00137     /* Initialization of inputB pointer */
00138     pIn2 = (pSrcA);
00139 
00140     /* srcBLen is always considered as shorter or equal to srcALen */
00141     j = srcBLen;
00142     srcBLen = srcALen;
00143     srcALen = j;
00144 
00145     /* CORR(x, y) = Reverse order(CORR(y, x)) */
00146     /* Hence set the destination pointer to point to the last output sample */
00147     pOut = pDst + ((srcALen + srcBLen) - 2u);
00148 
00149     /* Destination address modifier is set to -1 */
00150     inc = -1;
00151 
00152   }
00153 
00154   /* The function is internally   
00155    * divided into three parts according to the number of multiplications that have to   
00156    * take place between inputA samples and inputB samples. In the first part of the   
00157    * algorithm, the multiplications increase by one for every iteration.   
00158    * In the second part of the algorithm, srcBLen number of multiplications are done.   
00159    * In the third part of the algorithm, the multiplications decrease by one   
00160    * for every iteration. */
00161   /* The algorithm is implemented in three stages.   
00162    * The loop counters of each stage are initialized here. */
00163   blockSize1 = srcBLen - 1u;
00164   blockSize2 = srcALen - (srcBLen - 1u);
00165   blockSize3 = blockSize1;
00166 
00167   /* --------------------------   
00168    * Initializations of stage1   
00169    * -------------------------*/
00170 
00171   /* sum = x[0] * y[srcBlen - 1]   
00172    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]   
00173    * ....   
00174    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]   
00175    */
00176 
00177   /* In this stage the MAC operations are increased by 1 for every iteration.   
00178      The count variable holds the number of MAC operations performed */
00179   count = 1u;
00180 
00181   /* Working pointer of inputA */
00182   px = pIn1;
00183 
00184   /* Working pointer of inputB */
00185   pSrc1 = pIn2 + (srcBLen - 1u);
00186   py = pSrc1;
00187 
00188   /* ------------------------   
00189    * Stage1 process   
00190    * ----------------------*/
00191 
00192   /* The first loop starts here */
00193   while(blockSize1 > 0u)
00194   {
00195     /* Accumulator is made zero for every iteration */
00196     sum = 0;
00197 
00198     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00199     k = count >> 2;
00200 
00201     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00202      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00203     while(k > 0u)
00204     {
00205       /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
00206       sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
00207       /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
00208       sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
00209 
00210       /* Decrement the loop counter */
00211       k--;
00212     }
00213 
00214     /* If the count is not a multiple of 4, compute any remaining MACs here.   
00215      ** No loop unrolling is used. */
00216     k = count % 0x4u;
00217 
00218     while(k > 0u)
00219     {
00220       /* Perform the multiply-accumulates */
00221       /* x[0] * y[srcBLen - 1] */
00222       sum = __SMLAD(*px++, *py++, sum);
00223 
00224       /* Decrement the loop counter */
00225       k--;
00226     }
00227 
00228     /* Store the result in the accumulator in the destination buffer. */
00229     *pOut = (q15_t) (sum >> 15);
00230     /* Destination pointer is updated according to the address modifier, inc */
00231     pOut += inc;
00232 
00233     /* Update the inputA and inputB pointers for next MAC calculation */
00234     py = pSrc1 - count;
00235     px = pIn1;
00236 
00237     /* Increment the MAC count */
00238     count++;
00239 
00240     /* Decrement the loop counter */
00241     blockSize1--;
00242   }
00243 
00244   /* --------------------------   
00245    * Initializations of stage2   
00246    * ------------------------*/
00247 
00248   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]   
00249    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]   
00250    * ....   
00251    * sum = x[srcALen-srcBLen] * y[0] + x[srcALen-srcBLen+1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]   
00252    */
00253 
00254   /* Working pointer of inputA */
00255   px = pIn1;
00256 
00257   /* Working pointer of inputB */
00258   py = pIn2;
00259 
00260   /* count is index by which the pointer pIn1 to be incremented */
00261   count = 0u;
00262 
00263   /* -------------------   
00264    * Stage2 process   
00265    * ------------------*/
00266 
00267   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.   
00268    * So, to loop unroll over blockSize2,   
00269    * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
00270   if(srcBLen >= 4u)
00271   {
00272     /* Loop unroll over blockSize2, by 4 */
00273     blkCnt = blockSize2 >> 2u;
00274 
00275     while(blkCnt > 0u)
00276     {
00277       /* Set all accumulators to zero */
00278       acc0 = 0;
00279       acc1 = 0;
00280       acc2 = 0;
00281       acc3 = 0;
00282 
00283       /* read x[0], x[1] samples */
00284       x0 = *__SIMD32(px);
00285       /* read x[1], x[2] samples */
00286       x1 = _SIMD32_OFFSET(px + 1);
00287       px += 2u;
00288 
00289       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00290       k = srcBLen >> 2u;
00291 
00292       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00293        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00294       do
00295       {
00296         /* Read the first two inputB samples using SIMD:   
00297          * y[0] and y[1] */
00298         c0 = *__SIMD32(py)++;
00299 
00300         /* acc0 +=  x[0] * y[0] + x[1] * y[1] */
00301         acc0 = __SMLAD(x0, c0, acc0);
00302 
00303         /* acc1 +=  x[1] * y[0] + x[2] * y[1] */
00304         acc1 = __SMLAD(x1, c0, acc1);
00305 
00306         /* Read x[2], x[3] */
00307         x2 = *__SIMD32(px);
00308 
00309         /* Read x[3], x[4] */
00310         x3 = _SIMD32_OFFSET(px + 1);
00311 
00312         /* acc2 +=  x[2] * y[0] + x[3] * y[1] */
00313         acc2 = __SMLAD(x2, c0, acc2);
00314 
00315         /* acc3 +=  x[3] * y[0] + x[4] * y[1] */
00316         acc3 = __SMLAD(x3, c0, acc3);
00317 
00318         /* Read y[2] and y[3] */
00319         c0 = *__SIMD32(py)++;
00320 
00321         /* acc0 +=  x[2] * y[2] + x[3] * y[3] */
00322         acc0 = __SMLAD(x2, c0, acc0);
00323 
00324         /* acc1 +=  x[3] * y[2] + x[4] * y[3] */
00325         acc1 = __SMLAD(x3, c0, acc1);
00326 
00327         /* Read x[4], x[5] */
00328         x0 = _SIMD32_OFFSET(px + 2);
00329 
00330         /* Read x[5], x[6] */
00331         x1 = _SIMD32_OFFSET(px + 3);
00332         px += 4u;
00333 
00334         /* acc2 +=  x[4] * y[2] + x[5] * y[3] */
00335         acc2 = __SMLAD(x0, c0, acc2);
00336 
00337         /* acc3 +=  x[5] * y[2] + x[6] * y[3] */
00338         acc3 = __SMLAD(x1, c0, acc3);
00339 
00340       } while(--k);
00341 
00342       /* For the next MAC operations, SIMD is not used   
00343        * So, the 16 bit pointer of inputB, py is updated */
00344 
00345       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00346        ** No loop unrolling is used. */
00347       k = srcBLen % 0x4u;
00348 
00349       if(k == 1u)
00350       {
00351         /* Read y[4] */
00352         c0 = *py;
00353 #ifdef  ARM_MATH_BIG_ENDIAN
00354 
00355         c0 = c0 << 16u;
00356 
00357 #else
00358 
00359         c0 = c0 & 0x0000FFFF;
00360 
00361 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00362 
00363         /* Read x[7] */
00364         x3 = *__SIMD32(px);
00365         px++;
00366 
00367         /* Perform the multiply-accumulates */
00368         acc0 = __SMLAD(x0, c0, acc0);
00369         acc1 = __SMLAD(x1, c0, acc1);
00370         acc2 = __SMLADX(x1, c0, acc2);
00371         acc3 = __SMLADX(x3, c0, acc3);
00372       }
00373 
00374       if(k == 2u)
00375       {
00376         /* Read y[4], y[5] */
00377         c0 = *__SIMD32(py);
00378 
00379         /* Read x[7], x[8] */
00380         x3 = *__SIMD32(px);
00381 
00382         /* Read x[9] */
00383         x2 = _SIMD32_OFFSET(px + 1);
00384         px += 2u;
00385 
00386         /* Perform the multiply-accumulates */
00387         acc0 = __SMLAD(x0, c0, acc0);
00388         acc1 = __SMLAD(x1, c0, acc1);
00389         acc2 = __SMLAD(x3, c0, acc2);
00390         acc3 = __SMLAD(x2, c0, acc3);
00391       }
00392 
00393       if(k == 3u)
00394       {
00395         /* Read y[4], y[5] */
00396         c0 = *__SIMD32(py)++;
00397 
00398         /* Read x[7], x[8] */
00399         x3 = *__SIMD32(px);
00400 
00401         /* Read x[9] */
00402         x2 = _SIMD32_OFFSET(px + 1);
00403 
00404         /* Perform the multiply-accumulates */
00405         acc0 = __SMLAD(x0, c0, acc0);
00406         acc1 = __SMLAD(x1, c0, acc1);
00407         acc2 = __SMLAD(x3, c0, acc2);
00408         acc3 = __SMLAD(x2, c0, acc3);
00409 
00410         c0 = (*py);
00411         /* Read y[6] */
00412 #ifdef  ARM_MATH_BIG_ENDIAN
00413 
00414         c0 = c0 << 16u;
00415 #else
00416 
00417         c0 = c0 & 0x0000FFFF;
00418 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00419 
00420         /* Read x[10] */
00421         x3 = _SIMD32_OFFSET(px + 2);
00422         px += 3u;
00423 
00424         /* Perform the multiply-accumulates */
00425         acc0 = __SMLADX(x1, c0, acc0);
00426         acc1 = __SMLAD(x2, c0, acc1);
00427         acc2 = __SMLADX(x2, c0, acc2);
00428         acc3 = __SMLADX(x3, c0, acc3);
00429       }
00430 
00431       /* Store the result in the accumulator in the destination buffer. */
00432       *pOut = (q15_t) (acc0 >> 15);
00433       /* Destination pointer is updated according to the address modifier, inc */
00434       pOut += inc;
00435 
00436       *pOut = (q15_t) (acc1 >> 15);
00437       pOut += inc;
00438 
00439       *pOut = (q15_t) (acc2 >> 15);
00440       pOut += inc;
00441 
00442       *pOut = (q15_t) (acc3 >> 15);
00443       pOut += inc;
00444 
00445       /* Increment the pointer pIn1 index, count by 4 */
00446       count += 4u;
00447 
00448       /* Update the inputA and inputB pointers for next MAC calculation */
00449       px = pIn1 + count;
00450       py = pIn2;
00451 
00452 
00453       /* Decrement the loop counter */
00454       blkCnt--;
00455     }
00456 
00457     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.   
00458      ** No loop unrolling is used. */
00459     blkCnt = blockSize2 % 0x4u;
00460 
00461     while(blkCnt > 0u)
00462     {
00463       /* Accumulator is made zero for every iteration */
00464       sum = 0;
00465 
00466       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00467       k = srcBLen >> 2u;
00468 
00469       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00470        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00471       while(k > 0u)
00472       {
00473         /* Perform the multiply-accumulates */
00474         sum += ((q31_t) * px++ * *py++);
00475         sum += ((q31_t) * px++ * *py++);
00476         sum += ((q31_t) * px++ * *py++);
00477         sum += ((q31_t) * px++ * *py++);
00478 
00479         /* Decrement the loop counter */
00480         k--;
00481       }
00482 
00483       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00484        ** No loop unrolling is used. */
00485       k = srcBLen % 0x4u;
00486 
00487       while(k > 0u)
00488       {
00489         /* Perform the multiply-accumulates */
00490         sum += ((q31_t) * px++ * *py++);
00491 
00492         /* Decrement the loop counter */
00493         k--;
00494       }
00495 
00496       /* Store the result in the accumulator in the destination buffer. */
00497       *pOut = (q15_t) (sum >> 15);
00498       /* Destination pointer is updated according to the address modifier, inc */
00499       pOut += inc;
00500 
00501       /* Increment the pointer pIn1 index, count by 1 */
00502       count++;
00503 
00504       /* Update the inputA and inputB pointers for next MAC calculation */
00505       px = pIn1 + count;
00506       py = pIn2;
00507 
00508       /* Decrement the loop counter */
00509       blkCnt--;
00510     }
00511   }
00512   else
00513   {
00514     /* If the srcBLen is less than 4,   
00515      * the blockSize2 loop cannot be unrolled by 4 */
00516     blkCnt = blockSize2;
00517 
00518     while(blkCnt > 0u)
00519     {
00520       /* Accumulator is made zero for every iteration */
00521       sum = 0;
00522 
00523       /* Loop over srcBLen */
00524       k = srcBLen;
00525 
00526       while(k > 0u)
00527       {
00528         /* Perform the multiply-accumulate */
00529         sum += ((q31_t) * px++ * *py++);
00530 
00531         /* Decrement the loop counter */
00532         k--;
00533       }
00534 
00535       /* Store the result in the accumulator in the destination buffer. */
00536       *pOut = (q15_t) (sum >> 15);
00537       /* Destination pointer is updated according to the address modifier, inc */
00538       pOut += inc;
00539 
00540       /* Increment the MAC count */
00541       count++;
00542 
00543       /* Update the inputA and inputB pointers for next MAC calculation */
00544       px = pIn1 + count;
00545       py = pIn2;
00546 
00547       /* Decrement the loop counter */
00548       blkCnt--;
00549     }
00550   }
00551 
00552   /* --------------------------   
00553    * Initializations of stage3   
00554    * -------------------------*/
00555 
00556   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-2]   
00557    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-3]   
00558    * ....   
00559    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]   
00560    * sum +=  x[srcALen-1] * y[0]   
00561    */
00562 
00563   /* In this stage the MAC operations are decreased by 1 for every iteration.   
00564      The count variable holds the number of MAC operations performed */
00565   count = srcBLen - 1u;
00566 
00567   /* Working pointer of inputA */
00568   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
00569   px = pSrc1;
00570 
00571   /* Working pointer of inputB */
00572   py = pIn2;
00573 
00574   /* -------------------   
00575    * Stage3 process   
00576    * ------------------*/
00577 
00578   while(blockSize3 > 0u)
00579   {
00580     /* Accumulator is made zero for every iteration */
00581     sum = 0;
00582 
00583     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00584     k = count >> 2u;
00585 
00586     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00587      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00588     while(k > 0u)
00589     {
00590       /* Perform the multiply-accumulates */
00591       /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
00592       sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
00593       /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
00594       sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
00595 
00596       /* Decrement the loop counter */
00597       k--;
00598     }
00599 
00600     /* If the count is not a multiple of 4, compute any remaining MACs here.   
00601      ** No loop unrolling is used. */
00602     k = count % 0x4u;
00603 
00604     while(k > 0u)
00605     {
00606       /* Perform the multiply-accumulates */
00607       sum = __SMLAD(*px++, *py++, sum);
00608 
00609       /* Decrement the loop counter */
00610       k--;
00611     }
00612 
00613     /* Store the result in the accumulator in the destination buffer. */
00614     *pOut = (q15_t) (sum >> 15);
00615     /* Destination pointer is updated according to the address modifier, inc */
00616     pOut += inc;
00617 
00618     /* Update the inputA and inputB pointers for next MAC calculation */
00619     px = ++pSrc1;
00620     py = pIn2;
00621 
00622     /* Decrement the MAC count */
00623     count--;
00624 
00625     /* Decrement the loop counter */
00626     blockSize3--;
00627   }
00628 
00629 #else
00630 
00631   q15_t *pIn1;                                   /* inputA pointer               */
00632   q15_t *pIn2;                                   /* inputB pointer               */
00633   q15_t *pOut = pDst;                            /* output pointer               */
00634   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */
00635   q15_t *px;                                     /* Intermediate inputA pointer  */
00636   q15_t *py;                                     /* Intermediate inputB pointer  */
00637   q15_t *pSrc1;                                  /* Intermediate pointers        */
00638   q31_t x0, x1, x2, x3, c0;                      /* temporary variables for holding input and coefficient values */
00639   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */
00640   int32_t inc = 1;                               /* Destination address modifier */
00641   q15_t a, b;
00642 
00643 
00644   /* The algorithm implementation is based on the lengths of the inputs. */
00645   /* srcB is always made to slide across srcA. */
00646   /* So srcBLen is always considered as shorter or equal to srcALen */
00647   /* But CORR(x, y) is reverse of CORR(y, x) */
00648   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
00649   /* and the destination pointer modifier, inc is set to -1 */
00650   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
00651   /* But to improve the performance,   
00652    * we include zeroes in the output instead of zero padding either of the inputs */
00653   /* If srcALen > srcBLen,   
00654    * (srcALen - srcBLen) zeroes have to be included at the start of the output buffer */
00655   /* If srcALen < srcBLen,   
00656    * (srcBLen - srcALen) zeroes have to be included at the end of the output buffer */
00657   if(srcALen >= srcBLen)
00658   {
00659     /* Initialization of inputA pointer */
00660     pIn1 = (pSrcA);
00661 
00662     /* Initialization of inputB pointer */
00663     pIn2 = (pSrcB);
00664 
00665     /* Number of output samples is calculated */
00666     outBlockSize = (2u * srcALen) - 1u;
00667 
00668     /* When srcALen > srcBLen, zero padding is done to srcB   
00669      * to make their lengths equal.   
00670      * Instead, (outBlockSize - (srcALen + srcBLen - 1))   
00671      * number of output samples are made zero */
00672     j = outBlockSize - (srcALen + (srcBLen - 1u));
00673 
00674     /* Updating the pointer position to non zero value */
00675     pOut += j;
00676 
00677   }
00678   else
00679   {
00680     /* Initialization of inputA pointer */
00681     pIn1 = (pSrcB);
00682 
00683     /* Initialization of inputB pointer */
00684     pIn2 = (pSrcA);
00685 
00686     /* srcBLen is always considered as shorter or equal to srcALen */
00687     j = srcBLen;
00688     srcBLen = srcALen;
00689     srcALen = j;
00690 
00691     /* CORR(x, y) = Reverse order(CORR(y, x)) */
00692     /* Hence set the destination pointer to point to the last output sample */
00693     pOut = pDst + ((srcALen + srcBLen) - 2u);
00694 
00695     /* Destination address modifier is set to -1 */
00696     inc = -1;
00697 
00698   }
00699 
00700   /* The function is internally   
00701    * divided into three parts according to the number of multiplications that have to   
00702    * take place between inputA samples and inputB samples. In the first part of the   
00703    * algorithm, the multiplications increase by one for every iteration.   
00704    * In the second part of the algorithm, srcBLen number of multiplications are done.   
00705    * In the third part of the algorithm, the multiplications decrease by one   
00706    * for every iteration. */
00707   /* The algorithm is implemented in three stages.   
00708    * The loop counters of each stage are initialized here. */
00709   blockSize1 = srcBLen - 1u;
00710   blockSize2 = srcALen - (srcBLen - 1u);
00711   blockSize3 = blockSize1;
00712 
00713   /* --------------------------   
00714    * Initializations of stage1   
00715    * -------------------------*/
00716 
00717   /* sum = x[0] * y[srcBlen - 1]   
00718    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]   
00719    * ....   
00720    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]   
00721    */
00722 
00723   /* In this stage the MAC operations are increased by 1 for every iteration.   
00724      The count variable holds the number of MAC operations performed */
00725   count = 1u;
00726 
00727   /* Working pointer of inputA */
00728   px = pIn1;
00729 
00730   /* Working pointer of inputB */
00731   pSrc1 = pIn2 + (srcBLen - 1u);
00732   py = pSrc1;
00733 
00734   /* ------------------------   
00735    * Stage1 process   
00736    * ----------------------*/
00737 
00738   /* The first loop starts here */
00739   while(blockSize1 > 0u)
00740   {
00741     /* Accumulator is made zero for every iteration */
00742     sum = 0;
00743 
00744     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00745     k = count >> 2;
00746 
00747     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00748      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00749     while(k > 0u)
00750     {
00751       /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
00752         sum += ((q31_t) * px++ * *py++);
00753         sum += ((q31_t) * px++ * *py++);
00754         sum += ((q31_t) * px++ * *py++);
00755         sum += ((q31_t) * px++ * *py++);
00756 
00757       /* Decrement the loop counter */
00758       k--;
00759     }
00760 
00761     /* If the count is not a multiple of 4, compute any remaining MACs here.   
00762      ** No loop unrolling is used. */
00763     k = count % 0x4u;
00764 
00765     while(k > 0u)
00766     {
00767       /* Perform the multiply-accumulates */
00768       /* x[0] * y[srcBLen - 1] */
00769         sum += ((q31_t) * px++ * *py++);
00770 
00771       /* Decrement the loop counter */
00772       k--;
00773     }
00774 
00775     /* Store the result in the accumulator in the destination buffer. */
00776     *pOut = (q15_t) (sum >> 15);
00777     /* Destination pointer is updated according to the address modifier, inc */
00778     pOut += inc;
00779 
00780     /* Update the inputA and inputB pointers for next MAC calculation */
00781     py = pSrc1 - count;
00782     px = pIn1;
00783 
00784     /* Increment the MAC count */
00785     count++;
00786 
00787     /* Decrement the loop counter */
00788     blockSize1--;
00789   }
00790 
00791   /* --------------------------   
00792    * Initializations of stage2   
00793    * ------------------------*/
00794 
00795   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]   
00796    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]   
00797    * ....   
00798    * sum = x[srcALen-srcBLen] * y[0] + x[srcALen-srcBLen+1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]   
00799    */
00800 
00801   /* Working pointer of inputA */
00802   px = pIn1;
00803 
00804   /* Working pointer of inputB */
00805   py = pIn2;
00806 
00807   /* count is index by which the pointer pIn1 to be incremented */
00808   count = 0u;
00809 
00810   /* -------------------   
00811    * Stage2 process   
00812    * ------------------*/
00813 
00814   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.   
00815    * So, to loop unroll over blockSize2,   
00816    * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
00817   if(srcBLen >= 4u)
00818   {
00819     /* Loop unroll over blockSize2, by 4 */
00820     blkCnt = blockSize2 >> 2u;
00821 
00822     while(blkCnt > 0u)
00823     {
00824       /* Set all accumulators to zero */
00825       acc0 = 0;
00826       acc1 = 0;
00827       acc2 = 0;
00828       acc3 = 0;
00829 
00830       /* read x[0], x[1], x[2] samples */
00831       a = *px;
00832       b = *(px + 1);
00833 
00834 #ifndef ARM_MATH_BIG_ENDIAN
00835 
00836       x0 = __PKHBT(a, b, 16);
00837       a = *(px + 2);
00838       x1 = __PKHBT(b, a, 16);
00839 
00840 #else
00841 
00842       x0 = __PKHBT(b, a, 16);
00843       a = *(px + 2);
00844       x1 = __PKHBT(a, b, 16);
00845 
00846 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
00847 
00848       px += 2u;
00849 
00850       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00851       k = srcBLen >> 2u;
00852 
00853       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00854        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00855       do
00856       {
00857         /* Read the first two inputB samples using SIMD:   
00858          * y[0] and y[1] */
00859           a = *py;
00860           b = *(py + 1);
00861     
00862 #ifndef ARM_MATH_BIG_ENDIAN
00863     
00864           c0 = __PKHBT(a, b, 16);
00865     
00866 #else
00867     
00868           c0 = __PKHBT(b, a, 16);
00869     
00870 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
00871 
00872         /* acc0 +=  x[0] * y[0] + x[1] * y[1] */
00873         acc0 = __SMLAD(x0, c0, acc0);
00874 
00875         /* acc1 +=  x[1] * y[0] + x[2] * y[1] */
00876         acc1 = __SMLAD(x1, c0, acc1);
00877 
00878         /* Read x[2], x[3], x[4] */
00879         a = *px;
00880         b = *(px + 1);
00881 
00882 #ifndef ARM_MATH_BIG_ENDIAN
00883 
00884         x2 = __PKHBT(a, b, 16);
00885         a = *(px + 2);
00886         x3 = __PKHBT(b, a, 16);
00887 
00888 #else
00889 
00890         x2 = __PKHBT(b, a, 16);
00891         a = *(px + 2);
00892         x3 = __PKHBT(a, b, 16);
00893 
00894 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
00895 
00896         /* acc2 +=  x[2] * y[0] + x[3] * y[1] */
00897         acc2 = __SMLAD(x2, c0, acc2);
00898 
00899         /* acc3 +=  x[3] * y[0] + x[4] * y[1] */
00900         acc3 = __SMLAD(x3, c0, acc3);
00901 
00902         /* Read y[2] and y[3] */
00903           a = *(py + 2);
00904           b = *(py + 3);
00905 
00906           py += 4u;
00907     
00908 #ifndef ARM_MATH_BIG_ENDIAN
00909     
00910           c0 = __PKHBT(a, b, 16);
00911     
00912 #else
00913     
00914           c0 = __PKHBT(b, a, 16);
00915     
00916 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
00917 
00918         /* acc0 +=  x[2] * y[2] + x[3] * y[3] */
00919         acc0 = __SMLAD(x2, c0, acc0);
00920 
00921         /* acc1 +=  x[3] * y[2] + x[4] * y[3] */
00922         acc1 = __SMLAD(x3, c0, acc1);
00923 
00924         /* Read x[4], x[5], x[6] */
00925         a = *(px + 2);
00926         b = *(px + 3);
00927 
00928 #ifndef ARM_MATH_BIG_ENDIAN
00929 
00930         x0 = __PKHBT(a, b, 16);
00931         a = *(px + 4);
00932         x1 = __PKHBT(b, a, 16);
00933 
00934 #else
00935 
00936         x0 = __PKHBT(b, a, 16);
00937         a = *(px + 4);
00938         x1 = __PKHBT(a, b, 16);
00939 
00940 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
00941 
00942         px += 4u;
00943 
00944         /* acc2 +=  x[4] * y[2] + x[5] * y[3] */
00945         acc2 = __SMLAD(x0, c0, acc2);
00946 
00947         /* acc3 +=  x[5] * y[2] + x[6] * y[3] */
00948         acc3 = __SMLAD(x1, c0, acc3);
00949 
00950       } while(--k);
00951 
00952       /* For the next MAC operations, SIMD is not used   
00953        * So, the 16 bit pointer of inputB, py is updated */
00954 
00955       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00956        ** No loop unrolling is used. */
00957       k = srcBLen % 0x4u;
00958 
00959       if(k == 1u)
00960       {
00961         /* Read y[4] */
00962         c0 = *py;
00963 #ifdef  ARM_MATH_BIG_ENDIAN
00964 
00965         c0 = c0 << 16u;
00966 
00967 #else
00968 
00969         c0 = c0 & 0x0000FFFF;
00970 
00971 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00972 
00973         /* Read x[7] */
00974         a = *px;
00975         b = *(px + 1);
00976 
00977         px++;;
00978     
00979 #ifndef ARM_MATH_BIG_ENDIAN
00980     
00981         x3 = __PKHBT(a, b, 16);
00982     
00983 #else
00984     
00985         x3 = __PKHBT(b, a, 16);
00986     
00987 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
00988 
00989         px++;
00990 
00991         /* Perform the multiply-accumulates */
00992         acc0 = __SMLAD(x0, c0, acc0);
00993         acc1 = __SMLAD(x1, c0, acc1);
00994         acc2 = __SMLADX(x1, c0, acc2);
00995         acc3 = __SMLADX(x3, c0, acc3);
00996       }
00997 
00998       if(k == 2u)
00999       {
01000         /* Read y[4], y[5] */
01001           a = *py;
01002           b = *(py + 1);
01003     
01004 #ifndef ARM_MATH_BIG_ENDIAN
01005     
01006           c0 = __PKHBT(a, b, 16);
01007     
01008 #else
01009     
01010           c0 = __PKHBT(b, a, 16);
01011     
01012 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01013 
01014         /* Read x[7], x[8], x[9] */
01015         a = *px;
01016         b = *(px + 1);
01017 
01018 #ifndef ARM_MATH_BIG_ENDIAN
01019 
01020         x3 = __PKHBT(a, b, 16);
01021         a = *(px + 2);
01022         x2 = __PKHBT(b, a, 16);
01023 
01024 #else
01025 
01026         x3 = __PKHBT(b, a, 16);
01027         a = *(px + 2);
01028         x2 = __PKHBT(a, b, 16);
01029 
01030 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01031 
01032         px += 2u;
01033 
01034         /* Perform the multiply-accumulates */
01035         acc0 = __SMLAD(x0, c0, acc0);
01036         acc1 = __SMLAD(x1, c0, acc1);
01037         acc2 = __SMLAD(x3, c0, acc2);
01038         acc3 = __SMLAD(x2, c0, acc3);
01039       }
01040 
01041       if(k == 3u)
01042       {
01043         /* Read y[4], y[5] */
01044           a = *py;
01045           b = *(py + 1);
01046     
01047 #ifndef ARM_MATH_BIG_ENDIAN
01048     
01049           c0 = __PKHBT(a, b, 16);
01050     
01051 #else
01052     
01053           c0 = __PKHBT(b, a, 16);
01054     
01055 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01056 
01057         py += 2u;
01058 
01059         /* Read x[7], x[8], x[9] */
01060         a = *px;
01061         b = *(px + 1);
01062 
01063 #ifndef ARM_MATH_BIG_ENDIAN
01064 
01065         x3 = __PKHBT(a, b, 16);
01066         a = *(px + 2);
01067         x2 = __PKHBT(b, a, 16);
01068 
01069 #else
01070 
01071         x3 = __PKHBT(b, a, 16);
01072         a = *(px + 2);
01073         x2 = __PKHBT(a, b, 16);
01074 
01075 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01076 
01077         /* Perform the multiply-accumulates */
01078         acc0 = __SMLAD(x0, c0, acc0);
01079         acc1 = __SMLAD(x1, c0, acc1);
01080         acc2 = __SMLAD(x3, c0, acc2);
01081         acc3 = __SMLAD(x2, c0, acc3);
01082 
01083         c0 = (*py);
01084         /* Read y[6] */
01085 #ifdef  ARM_MATH_BIG_ENDIAN
01086 
01087         c0 = c0 << 16u;
01088 #else
01089 
01090         c0 = c0 & 0x0000FFFF;
01091 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
01092 
01093         /* Read x[10] */
01094         b = *(px + 3);
01095     
01096 #ifndef ARM_MATH_BIG_ENDIAN
01097     
01098         x3 = __PKHBT(a, b, 16);
01099     
01100 #else
01101     
01102         x3 = __PKHBT(b, a, 16);
01103     
01104 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01105 
01106         px += 3u;
01107 
01108         /* Perform the multiply-accumulates */
01109         acc0 = __SMLADX(x1, c0, acc0);
01110         acc1 = __SMLAD(x2, c0, acc1);
01111         acc2 = __SMLADX(x2, c0, acc2);
01112         acc3 = __SMLADX(x3, c0, acc3);
01113       }
01114 
01115       /* Store the result in the accumulator in the destination buffer. */
01116       *pOut = (q15_t) (acc0 >> 15);
01117       /* Destination pointer is updated according to the address modifier, inc */
01118       pOut += inc;
01119 
01120       *pOut = (q15_t) (acc1 >> 15);
01121       pOut += inc;
01122 
01123       *pOut = (q15_t) (acc2 >> 15);
01124       pOut += inc;
01125 
01126       *pOut = (q15_t) (acc3 >> 15);
01127       pOut += inc;
01128 
01129       /* Increment the pointer pIn1 index, count by 4 */
01130       count += 4u;
01131 
01132       /* Update the inputA and inputB pointers for next MAC calculation */
01133       px = pIn1 + count;
01134       py = pIn2;
01135 
01136 
01137       /* Decrement the loop counter */
01138       blkCnt--;
01139     }
01140 
01141     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.   
01142      ** No loop unrolling is used. */
01143     blkCnt = blockSize2 % 0x4u;
01144 
01145     while(blkCnt > 0u)
01146     {
01147       /* Accumulator is made zero for every iteration */
01148       sum = 0;
01149 
01150       /* Apply loop unrolling and compute 4 MACs simultaneously. */
01151       k = srcBLen >> 2u;
01152 
01153       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
01154        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
01155       while(k > 0u)
01156       {
01157         /* Perform the multiply-accumulates */
01158         sum += ((q31_t) * px++ * *py++);
01159         sum += ((q31_t) * px++ * *py++);
01160         sum += ((q31_t) * px++ * *py++);
01161         sum += ((q31_t) * px++ * *py++);
01162 
01163         /* Decrement the loop counter */
01164         k--;
01165       }
01166 
01167       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
01168        ** No loop unrolling is used. */
01169       k = srcBLen % 0x4u;
01170 
01171       while(k > 0u)
01172       {
01173         /* Perform the multiply-accumulates */
01174         sum += ((q31_t) * px++ * *py++);
01175 
01176         /* Decrement the loop counter */
01177         k--;
01178       }
01179 
01180       /* Store the result in the accumulator in the destination buffer. */
01181       *pOut = (q15_t) (sum >> 15);
01182       /* Destination pointer is updated according to the address modifier, inc */
01183       pOut += inc;
01184 
01185       /* Increment the pointer pIn1 index, count by 1 */
01186       count++;
01187 
01188       /* Update the inputA and inputB pointers for next MAC calculation */
01189       px = pIn1 + count;
01190       py = pIn2;
01191 
01192       /* Decrement the loop counter */
01193       blkCnt--;
01194     }
01195   }
01196   else
01197   {
01198     /* If the srcBLen is less than 4,   
01199      * the blockSize2 loop is not unrolled by 4 */
01200     blkCnt = blockSize2;
01201 
01202     while(blkCnt > 0u)
01203     {
01204       /* Accumulator is made zero for every iteration */
01205       sum = 0;
01206 
01207       /* Loop over srcBLen */
01208       k = srcBLen;
01209 
01210       while(k > 0u)
01211       {
01212         /* Perform the multiply-accumulate */
01213         sum += ((q31_t) * px++ * *py++);
01214 
01215         /* Decrement the loop counter */
01216         k--;
01217       }
01218 
01219       /* Store the result in the accumulator in the destination buffer. */
01220       *pOut = (q15_t) (sum >> 15);
01221       /* Destination pointer is updated according to the address modifier, inc */
01222       pOut += inc;
01223 
01224       /* Increment the MAC count */
01225       count++;
01226 
01227       /* Update the inputA and inputB pointers for next MAC calculation */
01228       px = pIn1 + count;
01229       py = pIn2;
01230 
01231       /* Decrement the loop counter */
01232       blkCnt--;
01233     }
01234   }
01235 
01236   /* --------------------------   
01237    * Initializations of stage3   
01238    * -------------------------*/
01239 
01240   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-2]   
01241    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-3]   
01242    * ....   
01243    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]   
01244    * sum +=  x[srcALen-1] * y[0]   
01245    */
01246 
01247   /* In this stage the MAC operations are decreased by 1 for every iteration.   
01248      The count variable holds the number of MAC operations performed */
01249   count = srcBLen - 1u;
01250 
01251   /* Working pointer of inputA */
01252   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
01253   px = pSrc1;
01254 
01255   /* Working pointer of inputB */
01256   py = pIn2;
01257 
01258   /* -------------------   
01259    * Stage3 process   
01260    * ------------------*/
01261 
01262   while(blockSize3 > 0u)
01263   {
01264     /* Accumulator is made zero for every iteration */
01265     sum = 0;
01266 
01267     /* Apply loop unrolling and compute 4 MACs simultaneously. */
01268     k = count >> 2u;
01269 
01270     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
01271      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
01272     while(k > 0u)
01273     {
01274       /* Perform the multiply-accumulates */
01275         sum += ((q31_t) * px++ * *py++);
01276         sum += ((q31_t) * px++ * *py++);
01277         sum += ((q31_t) * px++ * *py++);
01278         sum += ((q31_t) * px++ * *py++);
01279 
01280       /* Decrement the loop counter */
01281       k--;
01282     }
01283 
01284     /* If the count is not a multiple of 4, compute any remaining MACs here.   
01285      ** No loop unrolling is used. */
01286     k = count % 0x4u;
01287 
01288     while(k > 0u)
01289     {
01290       /* Perform the multiply-accumulates */
01291         sum += ((q31_t) * px++ * *py++);
01292 
01293       /* Decrement the loop counter */
01294       k--;
01295     }
01296 
01297     /* Store the result in the accumulator in the destination buffer. */
01298     *pOut = (q15_t) (sum >> 15);
01299     /* Destination pointer is updated according to the address modifier, inc */
01300     pOut += inc;
01301 
01302     /* Update the inputA and inputB pointers for next MAC calculation */
01303     px = ++pSrc1;
01304     py = pIn2;
01305 
01306     /* Decrement the MAC count */
01307     count--;
01308 
01309     /* Decrement the loop counter */
01310     blockSize3--;
01311   }
01312 
01313 #endif /*   #ifndef UNALIGNED_SUPPORT_DISABLE */
01314 
01315 }
01316 
01317 /**   
01318  * @} end of Corr group   
01319  */