CMSIS DSP library

Dependents:   performance_timer Surfboard_ gps2rtty Capstone ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_correlate_q7.c Source File

arm_correlate_q7.c

00001 /* ----------------------------------------------------------------------   
00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.   
00003 *   
00004 * $Date:        19. March 2015
00005 * $Revision:    V.1.4.5
00006 *   
00007 * Project:      CMSIS DSP Library   
00008 * Title:        arm_correlate_q7.c   
00009 *   
00010 * Description:  Correlation of Q7 sequences. 
00011 *   
00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.  
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**   
00044  * @ingroup groupFilters   
00045  */
00046 
00047 /**   
00048  * @addtogroup Corr   
00049  * @{   
00050  */
00051 
00052 /**
00053  * @brief Correlation of Q7 sequences.
00054  * @param[in] *pSrcA points to the first input sequence.
00055  * @param[in] srcALen length of the first input sequence.
00056  * @param[in] *pSrcB points to the second input sequence.
00057  * @param[in] srcBLen length of the second input sequence.
00058  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.  Only the srcALen + srcBLen - 1 correlation samples are stored; the remaining |srcALen - srcBLen| zero-padding entries (at the start when srcALen > srcBLen, at the end when srcALen < srcBLen) are not written, so the caller must clear the buffer beforehand if they are expected to read as zero.
00059  * @return none.
00060  *
00061  * @details
00062  * <b>Scaling and Overflow Behavior:</b>
00063  *
00064  * \par
00065  * The function is implemented using a 32-bit internal accumulator.
00066  * Both inputs are represented in 1.7 format and multiplications yield a 2.14 result.
00067  * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
00068  * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
00069  * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format.
00070  *
00071  * \par
00072  * Refer to the function <code>arm_correlate_opt_q7()</code> for a faster implementation of this function.
00073  *
00074  */
00076 void arm_correlate_q7(
00077   q7_t * pSrcA,
00078   uint32_t srcALen,
00079   q7_t * pSrcB,
00080   uint32_t srcBLen,
00081   q7_t * pDst)
00082 {
00083 
00084 
00085 #ifndef ARM_MATH_CM0_FAMILY
00086 
00087   /* Run the below code for Cortex-M4 and Cortex-M3 */
00088 
00089   q7_t *pIn1;                                    /* inputA pointer               */
00090   q7_t *pIn2;                                    /* inputB pointer               */
00091   q7_t *pOut = pDst;                             /* output pointer               */
00092   q7_t *px;                                      /* Intermediate inputA pointer  */
00093   q7_t *py;                                      /* Intermediate inputB pointer  */
00094   q7_t *pSrc1;                                   /* Intermediate pointers        */
00095   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */
00096   q31_t input1, input2;                          /* two sign-extended samples packed as 16-bit halves for __SMLAD */
00097   q15_t in1, in2;                                /* samples widened to 16 bit before packing */
00098   q7_t x0, x1, x2, x3, c0, c1;                   /* temporary variables for holding input and coefficient values */
00099   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counters and stage sizes */
00100   int32_t inc = 1;                               /* destination pointer step: +1 forward, -1 when the inputs are swapped */
00101 
00102 
00103   /* The algorithm implementation is based on the lengths of the inputs. */
00104   /* srcB is always made to slide across srcA. */
00105   /* So srcBLen is always considered as shorter or equal to srcALen */
00106   /* But CORR(x, y) is reverse of CORR(y, x) */
00107   /* So, when srcBLen > srcALen, output pointer is made to point to the last computed output sample */
00108   /* and the destination pointer modifier, inc is set to -1 */
00109   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
00110   /* But to improve the performance,
00111    * the zero positions of the output are skipped instead of zero padding either of the inputs */
00112   /* If srcALen > srcBLen,
00113    * the first (srcALen - srcBLen) output positions are skipped and left unwritten */
00114   /* If srcALen < srcBLen,
00115    * the last (srcBLen - srcALen) output positions are left unwritten */
00116   if(srcALen >= srcBLen)
00117   {
00118     /* Initialization of inputA pointer */
00119     pIn1 = (pSrcA);
00120 
00121     /* Initialization of inputB pointer */
00122     pIn2 = (pSrcB);
00123 
00124     /* Number of output samples is calculated */
00125     outBlockSize = (2u * srcALen) - 1u;
00126 
00127     /* When srcALen > srcBLen, zero padding would be needed on srcB
00128      * to make their lengths equal.
00129      * Instead, the first (outBlockSize - (srcALen + srcBLen - 1))
00130      * output positions are skipped; nothing is stored to them here */
00131     j = outBlockSize - (srcALen + (srcBLen - 1u));
00132 
00133     /* Advance the output pointer past the skipped zero-pad positions */
00134     pOut += j;
00135 
00136   }
00137   else
00138   {
00139     /* Initialization of inputA pointer */
00140     pIn1 = (pSrcB);
00141 
00142     /* Initialization of inputB pointer */
00143     pIn2 = (pSrcA);
00144 
00145     /* Swap the lengths so that srcBLen is again the shorter one */
00146     j = srcBLen;
00147     srcBLen = srcALen;
00148     srcALen = j;
00149 
00150     /* CORR(x, y) = Reverse order(CORR(y, x)) */
00151     /* Hence set the destination pointer to point to the last output sample */
00152     pOut = pDst + ((srcALen + srcBLen) - 2u);
00153 
00154     /* Destination address modifier is set to -1 */
00155     inc = -1;
00156 
00157   }
00158 
00159   /* The function is internally
00160    * divided into three parts according to the number of multiplications that has to be
00161    * taken place between inputA samples and inputB samples. In the first part of the
00162    * algorithm, the multiplications increase by one for every iteration.
00163    * In the second part of the algorithm, srcBLen number of multiplications are done.
00164    * In the third part of the algorithm, the multiplications decrease by one
00165    * for every iteration.*/
00166   /* The algorithm is implemented in three stages.
00167    * The loop counters of each stage are initialized here. Both lengths are assumed to be non-zero: srcBLen - 1u wraps around for srcBLen == 0. */
00168   blockSize1 = srcBLen - 1u;
00169   blockSize2 = srcALen - (srcBLen - 1u);
00170   blockSize3 = blockSize1;
00171 
00172   /* --------------------------
00173    * Initializations of stage1
00174    * -------------------------*/
00175 
00176   /* sum = x[0] * y[srcBlen - 1]
00177    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
00178    * ....
00179    * sum = x[0] * y[1] + x[1] * y[2] +...+ x[srcBLen - 2] * y[srcBLen - 1]
00180    */
00181 
00182   /* In this stage the MAC operations are increased by 1 for every iteration.
00183      The count variable holds the number of MAC operations performed */
00184   count = 1u;
00185 
00186   /* Working pointer of inputA */
00187   px = pIn1;
00188 
00189   /* Working pointer of inputB, starting at its last sample */
00190   pSrc1 = pIn2 + (srcBLen - 1u);
00191   py = pSrc1;
00192 
00193   /* ------------------------
00194    * Stage1 process
00195    * ----------------------*/
00196 
00197   /* The first stage starts here */
00198   while(blockSize1 > 0u)
00199   {
00200     /* Accumulator is made zero for every iteration */
00201     sum = 0;
00202 
00203     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00204     k = count >> 2;
00205 
00206     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00207      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00208     while(k > 0u)
00209     {
00210       /* x[0] , x[1] */
00211       in1 = (q15_t) * px++;
00212       in2 = (q15_t) * px++;
00213       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00214 
00215       /* y[srcBLen - 4] , y[srcBLen - 3] */
00216       in1 = (q15_t) * py++;
00217       in2 = (q15_t) * py++;
00218       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00219 
00220       /* x[0] * y[srcBLen - 4] */
00221       /* x[1] * y[srcBLen - 3] */
00222       sum = __SMLAD(input1, input2, sum);
00223 
00224       /* x[2] , x[3] */
00225       in1 = (q15_t) * px++;
00226       in2 = (q15_t) * px++;
00227       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00228 
00229       /* y[srcBLen - 2] , y[srcBLen - 1] */
00230       in1 = (q15_t) * py++;
00231       in2 = (q15_t) * py++;
00232       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00233 
00234       /* x[2] * y[srcBLen - 2] */
00235       /* x[3] * y[srcBLen - 1] */
00236       sum = __SMLAD(input1, input2, sum);
00237 
00238 
00239       /* Decrement the loop counter */
00240       k--;
00241     }
00242 
00243     /* If the count is not a multiple of 4, compute any remaining MACs here.
00244      ** No loop unrolling is used. */
00245     k = count % 0x4u;
00246 
00247     while(k > 0u)
00248     {
00249       /* Perform the multiply-accumulates */
00250       /* x[0] * y[srcBLen - 1] */
00251       sum += (q31_t) ((q15_t) * px++ * *py++);
00252 
00253       /* Decrement the loop counter */
00254       k--;
00255     }
00256 
00257     /* Drop the low 7 bits of the accumulator, saturate to 8 bits and store the result in the destination buffer. */
00258     *pOut = (q7_t) (__SSAT(sum >> 7, 8));
00259     /* Destination pointer is updated according to the address modifier, inc */
00260     pOut += inc;
00261 
00262     /* Update the inputA and inputB pointers for next MAC calculation:
       inputB starts one sample earlier, inputA restarts from its first sample */
00263     py = pSrc1 - count;
00264     px = pIn1;
00265 
00266     /* Increment the MAC count */
00267     count++;
00268 
00269     /* Decrement the loop counter */
00270     blockSize1--;
00271   }
00272 
00273   /* --------------------------
00274    * Initializations of stage2
00275    * ------------------------*/
00276 
00277   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
00278    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
00279    * ....
00280    * sum = x[srcALen-srcBLen] * y[0] + x[srcALen-srcBLen+1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
00281    */
00282 
00283   /* Working pointer of inputA */
00284   px = pIn1;
00285 
00286   /* Working pointer of inputB */
00287   py = pIn2;
00288 
00289   /* count is the offset from pIn1 at which the current output sample starts reading inputA */
00290   count = 0u;
00291 
00292   /* -------------------
00293    * Stage2 process
00294    * ------------------*/
00295 
00296   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
00297    * So, to loop unroll over blockSize2,
00298    * srcBLen should be greater than or equal to 4 */
00299   if(srcBLen >= 4u)
00300   {
00301     /* Loop unroll over blockSize2, by 4 */
00302     blkCnt = blockSize2 >> 2u;
00303 
00304     while(blkCnt > 0u)
00305     {
00306       /* Set all accumulators to zero */
00307       acc0 = 0;
00308       acc1 = 0;
00309       acc2 = 0;
00310       acc3 = 0;
00311 
00312       /* read x[0], x[1], x[2] samples */
00313       x0 = *px++;
00314       x1 = *px++;
00315       x2 = *px++;
00316 
00317       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00318       k = srcBLen >> 2u;
00319 
00320       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00321        ** a second loop below computes MACs for the remaining 1 to 3 samples.
       ** k is at least 1 here because srcBLen >= 4, so the do/while form is safe. */
00322       do
00323       {
00324         /* Read y[0] sample */
00325         c0 = *py++;
00326         /* Read y[1] sample */
00327         c1 = *py++;
00328 
00329         /* Read x[3] sample */
00330         x3 = *px++;
00331 
00332         /* x[0] and x[1] are packed */
00333         in1 = (q15_t) x0;
00334         in2 = (q15_t) x1;
00335 
00336         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00337 
00338         /* y[0] and y[1] are packed */
00339         in1 = (q15_t) c0;
00340         in2 = (q15_t) c1;
00341 
00342         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00343 
00344         /* acc0 += x[0] * y[0] + x[1] * y[1]  */
00345         acc0 = __SMLAD(input1, input2, acc0);
00346 
00347         /* x[1] and x[2] are packed */
00348         in1 = (q15_t) x1;
00349         in2 = (q15_t) x2;
00350 
00351         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00352 
00353         /* acc1 += x[1] * y[0] + x[2] * y[1] */
00354         acc1 = __SMLAD(input1, input2, acc1);
00355 
00356         /* x[2] and x[3] are packed */
00357         in1 = (q15_t) x2;
00358         in2 = (q15_t) x3;
00359 
00360         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00361 
00362         /* acc2 += x[2] * y[0] + x[3] * y[1]  */
00363         acc2 = __SMLAD(input1, input2, acc2);
00364 
00365         /* Read x[4] sample */
00366         x0 = *(px++);
00367 
00368         /* x[3] and x[4] are packed */
00369         in1 = (q15_t) x3;
00370         in2 = (q15_t) x0;
00371 
00372         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00373 
00374         /* acc3 += x[3] * y[0] + x[4] * y[1]  */
00375         acc3 = __SMLAD(input1, input2, acc3);
00376 
00377         /* Read y[2] sample */
00378         c0 = *py++;
00379         /* Read y[3] sample */
00380         c1 = *py++;
00381 
00382         /* Read x[5] sample */
00383         x1 = *px++;
00384 
00385         /* x[2] and x[3] are packed */
00386         in1 = (q15_t) x2;
00387         in2 = (q15_t) x3;
00388 
00389         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00390 
00391         /* y[2] and y[3] are packed */
00392         in1 = (q15_t) c0;
00393         in2 = (q15_t) c1;
00394 
00395         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00396 
00397         /* acc0 += x[2] * y[2] + x[3] * y[3]  */
00398         acc0 = __SMLAD(input1, input2, acc0);
00399 
00400         /* x[3] and x[4] are packed */
00401         in1 = (q15_t) x3;
00402         in2 = (q15_t) x0;
00403 
00404         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00405 
00406         /* acc1 += x[3] * y[2] + x[4] * y[3]  */
00407         acc1 = __SMLAD(input1, input2, acc1);
00408 
00409         /* x[4] and x[5] are packed */
00410         in1 = (q15_t) x0;
00411         in2 = (q15_t) x1;
00412 
00413         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00414 
00415         /* acc2 += x[4] * y[2] + x[5] * y[3]  */
00416         acc2 = __SMLAD(input1, input2, acc2);
00417 
00418         /* Read x[6] sample */
00419         x2 = *px++;
00420 
00421         /* x[5] and x[6] are packed */
00422         in1 = (q15_t) x1;
00423         in2 = (q15_t) x2;
00424 
00425         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00426 
00427         /* acc3 += x[5] * y[2] + x[6] * y[3]  */
00428         acc3 = __SMLAD(input1, input2, acc3);
00429 
00430       } while(--k);
00431 
00432       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00433        ** No loop unrolling is used. */
00434       k = srcBLen % 0x4u;
00435 
00436       while(k > 0u)
00437       {
00438         /* Read y[4] sample */
00439         c0 = *py++;
00440 
00441         /* Read x[7] sample */
00442         x3 = *px++;
00443 
00444         /* Perform the multiply-accumulates */
00445         /* acc0 +=  x[4] * y[4] */
00446         acc0 += ((q15_t) x0 * c0);
00447         /* acc1 +=  x[5] * y[4] */
00448         acc1 += ((q15_t) x1 * c0);
00449         /* acc2 +=  x[6] * y[4] */
00450         acc2 += ((q15_t) x2 * c0);
00451         /* acc3 +=  x[7] * y[4] */
00452         acc3 += ((q15_t) x3 * c0);
00453 
00454         /* Reuse the present samples for the next MAC */
00455         x0 = x1;
00456         x1 = x2;
00457         x2 = x3;
00458 
00459         /* Decrement the loop counter */
00460         k--;
00461       }
00462 
00463       /* Store the four results (low 7 bits dropped, saturated to 8 bits) in the destination buffer. */
00464       *pOut = (q7_t) (__SSAT(acc0 >> 7, 8));
00465       /* Destination pointer is updated according to the address modifier, inc */
00466       pOut += inc;
00467 
00468       *pOut = (q7_t) (__SSAT(acc1 >> 7, 8));
00469       pOut += inc;
00470 
00471       *pOut = (q7_t) (__SSAT(acc2 >> 7, 8));
00472       pOut += inc;
00473 
00474       *pOut = (q7_t) (__SSAT(acc3 >> 7, 8));
00475       pOut += inc;
00476 
00477       count += 4u;
00478       /* Update the inputA and inputB pointers for next MAC calculation */
00479       px = pIn1 + count;
00480       py = pIn2;
00481 
00482       /* Decrement the loop counter */
00483       blkCnt--;
00484     }
00485 
00486     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
00487      ** No loop unrolling is used. */
00488     blkCnt = blockSize2 % 0x4u;
00489 
00490     while(blkCnt > 0u)
00491     {
00492       /* Accumulator is made zero for every iteration */
00493       sum = 0;
00494 
00495       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00496       k = srcBLen >> 2u;
00497 
00498       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00499        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00500       while(k > 0u)
00501       {
00502         /* Reading two inputs of SrcA buffer and packing */
00503         in1 = (q15_t) * px++;
00504         in2 = (q15_t) * px++;
00505         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00506 
00507         /* Reading two inputs of SrcB buffer and packing */
00508         in1 = (q15_t) * py++;
00509         in2 = (q15_t) * py++;
00510         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00511 
00512         /* Perform the multiply-accumulates */
00513         sum = __SMLAD(input1, input2, sum);
00514 
00515         /* Reading two inputs of SrcA buffer and packing */
00516         in1 = (q15_t) * px++;
00517         in2 = (q15_t) * px++;
00518         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00519 
00520         /* Reading two inputs of SrcB buffer and packing */
00521         in1 = (q15_t) * py++;
00522         in2 = (q15_t) * py++;
00523         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00524 
00525         /* Perform the multiply-accumulates */
00526         sum = __SMLAD(input1, input2, sum);
00527 
00528         /* Decrement the loop counter */
00529         k--;
00530       }
00531 
00532       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00533        ** No loop unrolling is used. */
00534       k = srcBLen % 0x4u;
00535 
00536       while(k > 0u)
00537       {
00538         /* Perform the multiply-accumulates */
00539         sum += ((q15_t) * px++ * *py++);
00540 
00541         /* Decrement the loop counter */
00542         k--;
00543       }
00544 
00545       /* Store the result in the accumulator in the destination buffer. */
00546       *pOut = (q7_t) (__SSAT(sum >> 7, 8));
00547       /* Destination pointer is updated according to the address modifier, inc */
00548       pOut += inc;
00549 
00550       /* Increment the pointer pIn1 index, count by 1 */
00551       count++;
00552 
00553       /* Update the inputA and inputB pointers for next MAC calculation */
00554       px = pIn1 + count;
00555       py = pIn2;
00556 
00557       /* Decrement the loop counter */
00558       blkCnt--;
00559     }
00560   }
00561   else
00562   {
00563     /* srcBLen is less than 4 here,
00564      * so the blockSize2 loop is not unrolled by 4 */
00565     blkCnt = blockSize2;
00566 
00567     while(blkCnt > 0u)
00568     {
00569       /* Accumulator is made zero for every iteration */
00570       sum = 0;
00571 
00572       /* Loop over srcBLen */
00573       k = srcBLen;
00574 
00575       while(k > 0u)
00576       {
00577         /* Perform the multiply-accumulate */
00578         sum += ((q15_t) * px++ * *py++);
00579 
00580         /* Decrement the loop counter */
00581         k--;
00582       }
00583 
00584       /* Store the result in the accumulator in the destination buffer. */
00585       *pOut = (q7_t) (__SSAT(sum >> 7, 8));
00586       /* Destination pointer is updated according to the address modifier, inc */
00587       pOut += inc;
00588 
00589       /* Increment the pointer pIn1 index, count by 1 */
00590       count++;
00591 
00592       /* Update the inputA and inputB pointers for next MAC calculation */
00593       px = pIn1 + count;
00594       py = pIn2;
00595 
00596 
00597       /* Decrement the loop counter */
00598       blkCnt--;
00599     }
00600   }
00601 
00602   /* --------------------------
00603    * Initializations of stage3
00604    * -------------------------*/
00605 
00606   /* sum = x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-2]
00607    * sum = x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-3]
00608    * ....
00609    * sum =  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
00610    * sum =  x[srcALen-1] * y[0]
00611    */
00612 
00613   /* In this stage the MAC operations are decreased by 1 for every iteration.
00614      The count variable holds the number of MAC operations performed */
00615   count = srcBLen - 1u;
00616 
00617   /* Working pointer of inputA */
00618   pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
00619   px = pSrc1;
00620 
00621   /* Working pointer of inputB */
00622   py = pIn2;
00623 
00624   /* -------------------
00625    * Stage3 process
00626    * ------------------*/
00627 
00628   while(blockSize3 > 0u)
00629   {
00630     /* Accumulator is made zero for every iteration */
00631     sum = 0;
00632 
00633     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00634     k = count >> 2u;
00635 
00636     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00637      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00638     while(k > 0u)
00639     {
00640       /* x[srcALen - srcBLen + 1] , x[srcALen - srcBLen + 2]  */
00641       in1 = (q15_t) * px++;
00642       in2 = (q15_t) * px++;
00643       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00644 
00645       /* y[0] , y[1] */
00646       in1 = (q15_t) * py++;
00647       in2 = (q15_t) * py++;
00648       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00649 
00650       /* sum += x[srcALen - srcBLen + 1] * y[0] */
00651       /* sum += x[srcALen - srcBLen + 2] * y[1] */
00652       sum = __SMLAD(input1, input2, sum);
00653 
00654       /* x[srcALen - srcBLen + 3] , x[srcALen - srcBLen + 4] */
00655       in1 = (q15_t) * px++;
00656       in2 = (q15_t) * px++;
00657       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00658 
00659       /* y[2] , y[3] */
00660       in1 = (q15_t) * py++;
00661       in2 = (q15_t) * py++;
00662       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00663 
00664       /* sum += x[srcALen - srcBLen + 3] * y[2] */
00665       /* sum += x[srcALen - srcBLen + 4] * y[3] */
00666       sum = __SMLAD(input1, input2, sum);
00667 
00668       /* Decrement the loop counter */
00669       k--;
00670     }
00671 
00672     /* If the count is not a multiple of 4, compute any remaining MACs here.
00673      ** No loop unrolling is used. */
00674     k = count % 0x4u;
00675 
00676     while(k > 0u)
00677     {
00678       /* Perform the multiply-accumulates */
00679       sum += ((q15_t) * px++ * *py++);
00680 
00681       /* Decrement the loop counter */
00682       k--;
00683     }
00684 
00685     /* Store the result in the accumulator in the destination buffer. */
00686     *pOut = (q7_t) (__SSAT(sum >> 7, 8));
00687     /* Destination pointer is updated according to the address modifier, inc */
00688     pOut += inc;
00689 
00690     /* Update the inputA and inputB pointers for next MAC calculation:
       inputA starts one sample later, inputB restarts from its first sample */
00691     px = ++pSrc1;
00692     py = pIn2;
00693 
00694     /* Decrement the MAC count */
00695     count--;
00696 
00697     /* Decrement the loop counter */
00698     blockSize3--;
00699   }
00700 
00701 #else
00702 
00703 /* Run the below code for Cortex-M0 */
00704 
00705   q7_t *pIn1 = pSrcA;                            /* inputA pointer */
00706   q7_t *pIn2 = pSrcB + (srcBLen - 1u);           /* points to the last sample of inputB */
00707   q31_t sum;                                     /* Accumulator */
00708   uint32_t i = 0u, j;                            /* loop counters */
00709   uint32_t inv = 0u;                             /* Reverse order flag */
00710   uint32_t tot = 0u;                             /* Index of the last computed output sample */
00711 
00712   /* The algorithm implementation is based on the lengths of the inputs. */
00713   /* srcB is always made to slide across srcA. */
00714   /* So srcBLen is always considered as shorter or equal to srcALen */
00715   /* But CORR(x, y) is reverse of CORR(y, x) */
00716   /* So, when srcBLen > srcALen, output pointer is made to point to the last computed output sample */
00717   /* and a variable, inv is set to 1 */
00718   /* If lengths are not equal then zero pad has to be done to  make the two
00719    * inputs of same length. But to improve the performance, the zero positions
00720    * of the output are skipped instead of zero padding either of the inputs */
00721   /* If srcALen > srcBLen, the first (srcALen - srcBLen) output positions
00722    * are skipped and left unwritten */
00723   /* If srcALen < srcBLen, the last (srcBLen - srcALen) output positions
00724    * are left unwritten */
00725   /* The remaining output samples are calculated
00726    * using convolution but with the shorter signal time shifted. */
00727 
00728   /* The loop below computes output samples 0..tot, i.e. srcALen + srcBLen - 1 values */
00729   tot = ((srcALen + srcBLen) - 2u);
00730 
00731   if(srcALen > srcBLen)
00732   {
00733     /* Calculating the number of leading output positions to skip */
00734     j = srcALen - srcBLen;
00735 
00736     /* Advance the output pointer past the skipped positions */
00737     pDst += j;
00738   }
00739 
00740   else if(srcALen < srcBLen)
00741   {
00742     /* Initialization to inputB pointer */
00743     pIn1 = pSrcB;
00744 
00745     /* Initialization to the end of inputA pointer */
00746     pIn2 = pSrcA + (srcALen - 1u);
00747 
00748     /* Point to the last computed output sample; results are stored backwards from here */
00749     pDst = pDst + tot;
00750 
00751     /* Swapping the lengths */
00752     j = srcALen;
00753     srcALen = srcBLen;
00754     srcBLen = j;
00755 
00756     /* Setting the reverse flag */
00757     inv = 1;
00758 
00759   }
00760 
00761   /* Loop over the tot + 1 computed output samples */
00762   for (i = 0u; i <= tot; i++)
00763   {
00764     /* Initialize sum with zero to carry on MAC operations */
00765     sum = 0;
00766 
00767     /* Loop to perform MAC operations according to convolution equation */
00768     for (j = 0u; j <= i; j++)
00769     {
00770       /* Check the array limitations */
00771       if((((i - j) < srcBLen) && (j < srcALen)))
00772       {
00773         /* z[i] += x[j] * y[(srcBLen - 1) - (i - j)]: pIn2 points at the last sample of the shorter input and is indexed backwards. Both operands are cast to int32_t so the index is a signed non-positive offset instead of a wrapped unsigned value. */
00774         sum += ((q15_t) pIn1[j] * pIn2[-((int32_t) i - (int32_t) j)]);
00775       }
00776     }
00777     /* Store the output in the destination buffer (backwards when the inputs were swapped) */
00778     if(inv == 1)
00779       *pDst-- = (q7_t) __SSAT((sum >> 7u), 8u);
00780     else
00781       *pDst++ = (q7_t) __SSAT((sum >> 7u), 8u);
00782   }
00783 
00784 #endif /*   #ifndef ARM_MATH_CM0_FAMILY */
00785 
00786 }
00787 
00788 /**   
00789  * @} end of Corr group   
00790  */