CMSIS DSP library

Dependents:   KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more

Fork of mbed-dsp by mbed official

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_q7.c Source File

arm_conv_q7.c

00001 /* ----------------------------------------------------------------------   
00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.   
00003 *   
00004 * $Date:        17. January 2013
00005 * $Revision:    V1.4.1
00006 *   
00007 * Project:      CMSIS DSP Library   
00008 * Title:        arm_conv_q7.c   
00009 *   
00010 * Description:  Convolution of Q7 sequences. 
00011 *   
00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.    
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**   
00044  * @ingroup groupFilters   
00045  */
00046 
00047 /**   
00048  * @addtogroup Conv   
00049  * @{   
00050  */
00051 
00052 /**   
00053  * @brief Convolution of Q7 sequences.   
00054  * @param[in] *pSrcA points to the first input sequence.   
00055  * @param[in] srcALen length of the first input sequence.   
00056  * @param[in] *pSrcB points to the second input sequence.   
00057  * @param[in] srcBLen length of the second input sequence.   
00058  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.   
00059  * @return none.   
00060  *   
00061  * @details   
00062  * <b>Scaling and Overflow Behavior:</b>   
00063  *   
00064  * \par   
00065  * The function is implemented using a 32-bit internal accumulator.   
00066  * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.   
00067  * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.   
00068  * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.   
00069  * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.   
00070  *
00071  * \par    
00072  * Refer to the function <code>arm_conv_opt_q7()</code> for a faster implementation of this function.
00073  * 
00074  */
00075 
00076 void arm_conv_q7(
00077   q7_t * pSrcA,
00078   uint32_t srcALen,
00079   q7_t * pSrcB,
00080   uint32_t srcBLen,
00081   q7_t * pDst)
00082 {
00083   /* Convolves pSrcA with pSrcB: each output is accumulated in 32 bits, shifted right by 7 and saturated to 8 bits before being written to pDst. */
00084   /* Two implementations are selected at compile time: a loop-unrolled one built on __SMLAD, and a plain nested-loop one when ARM_MATH_CM0_FAMILY is defined. */
00085 #ifndef ARM_MATH_CM0_FAMILY
00086 
00087   /* Run the below code for Cortex-M4 and Cortex-M3 */
00088 
00089   q7_t *pIn1;                                    /* start of the longer input sequence */
00090   q7_t *pIn2;                                    /* start of the shorter (or equal-length) input sequence */
00091   q7_t *pOut = pDst;                             /* running output pointer */
00092   q7_t *px;                                      /* working read pointer into pIn1 (walks forward) */
00093   q7_t *py;                                      /* working read pointer into pIn2 (walks backward) */
00094   q7_t *pSrc1, *pSrc2;                           /* per-stage start positions that px and py are reset to */
00095   q7_t x0, x1, x2, x3, c0, c1;                   /* stage 2: sliding window of pIn1 samples (x0..x3) and a pair of pIn2 samples (c0, c1) */
00096   q31_t sum, acc0, acc1, acc2, acc3;             /* 32-bit accumulators: one output (sum) or four outputs at once (acc0..acc3) */
00097   q31_t input1, input2;                          /* two samples packed into one 32-bit word, used as __SMLAD operands */
00098   q15_t in1, in2;                                /* samples widened to 16 bits before packing */
00099   uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3;     /* loop counters and per-stage output counts */
00100 
00101   /* The algorithm implementation is based on the lengths of the inputs. */
00102   /* The shorter sequence (pIn2) is always made to slide across the longer one (pIn1). */
00103   /* After this if/else, srcBLen is the shorter (or equal) length and srcALen the longer one. */
00104   if(srcALen >= srcBLen)
00105   {
00106     /* pSrcA is the longer (or equal) sequence: use it as inputA */
00107     pIn1 = pSrcA;
00108 
00109     /* pSrcB is the shorter (or equal) sequence: use it as inputB */
00110     pIn2 = pSrcB;
00111   }
00112   else
00113   {
00114     /* pSrcB is the longer sequence: use it as inputA */
00115     pIn1 = pSrcB;
00116 
00117     /* pSrcA is the shorter sequence: use it as inputB */
00118     pIn2 = pSrcA;
00119 
00120     /* Swap the two lengths (j is the temporary) so that srcBLen holds the shorter length */
00121     j = srcBLen;
00122     srcBLen = srcALen;
00123     srcALen = j;
00124   }
00125 
00126   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00127   /* The function is internally
00128    * divided into three stages according to the number of multiplications that
00129    * take place between inputA samples and inputB samples. In the first stage of the
00130    * algorithm, the multiplications increase by one for every iteration.
00131    * In the second stage of the algorithm, srcBLen number of multiplications are done.
00132    * In the third stage of the algorithm, the multiplications decrease by one
00133    * for every iteration. */
00134 
00135   /* The algorithm is implemented in three stages.
00136      The output counts of each stage are set here. NOTE(review): srcBLen - 1u wraps around when the shorter length is 0 - presumably callers must pass lengths >= 1; confirm. */
00137   blockSize1 = srcBLen - 1u;
00138   blockSize2 = (srcALen - srcBLen) + 1u;
00139   blockSize3 = blockSize1;
00140 
00141   /* --------------------------
00142    * Initializations of stage1
00143    * -------------------------*/
00144 
00145   /* sum = x[0] * y[0]
00146    * sum = x[0] * y[1] + x[1] * y[0]
00147    * ....
00148    * sum = x[0] * y[srcBLen - 2] + x[1] * y[srcBLen - 3] +...+ x[srcBLen - 2] * y[0]
00149    */
00150 
00151   /* In this stage the MAC operations are increased by 1 for every iteration.
00152      The count variable holds the number of MAC operations performed for the current output */
00153   count = 1u;
00154 
00155   /* Working pointer of inputA */
00156   px = pIn1;
00157 
00158   /* Working pointer of inputB */
00159   py = pIn2;
00160 
00161 
00162   /* ------------------------
00163    * Stage1 process
00164    * ----------------------*/
00165 
00166   /* The first stage starts here: one output per iteration, srcBLen - 1 outputs in total */
00167   while(blockSize1 > 0u)
00168   {
00169     /* Accumulator is made zero for every iteration */
00170     sum = 0;
00171 
00172     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00173     k = count >> 2u;
00174 
00175     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00176      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00177     while(k > 0u)
00178     {
00179       /* x[0] , x[1] (indices shown for the first pass); in1 goes to the low halfword, in2 to the high halfword */
00180       in1 = (q15_t) * px++;
00181       in2 = (q15_t) * px++;
00182       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
00183 
00184       /* y[count - 1] , y[count - 2] (py was reset to pIn2 + count - 1 for this output) */
00185       in1 = (q15_t) * py--;
00186       in2 = (q15_t) * py--;
00187       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
00188 
00189       /* x[0] * y[count - 1] */
00190       /* x[1] * y[count - 2] */
00191       sum = __SMLAD(input1, input2, sum);
00192 
00193       /* x[2] , x[3] */
00194       in1 = (q15_t) * px++;
00195       in2 = (q15_t) * px++;
00196       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
00197 
00198       /* y[count - 3] , y[count - 4] */
00199       in1 = (q15_t) * py--;
00200       in2 = (q15_t) * py--;
00201       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
00202 
00203       /* x[2] * y[count - 3] */
00204       /* x[3] * y[count - 4] */
00205       sum = __SMLAD(input1, input2, sum);
00206 
00207       /* Decrement the loop counter */
00208       k--;
00209     }
00210 
00211     /* If the count is not a multiple of 4, compute any remaining MACs here.
00212      ** No loop unrolling is used. */
00213     k = count % 0x4u;
00214 
00215     while(k > 0u)
00216     {
00217       /* Perform the multiply-accumulates */
00218       sum += ((q15_t) * px++ * *py--);
00219 
00220       /* Decrement the loop counter */
00221       k--;
00222     }
00223 
00224     /* Shift the accumulator right by 7, saturate to 8 bits and store the output sample. */
00225     *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
00226 
00227     /* Reset the pointers for the next output: py starts one sample further into inputB, px restarts at x[0] */
00228     py = pIn2 + count;
00229     px = pIn1;
00230 
00231     /* Increment the MAC count */
00232     count++;
00233 
00234     /* Decrement the loop counter */
00235     blockSize1--;
00236   }
00237 
00238   /* --------------------------
00239    * Initializations of stage2
00240    * ------------------------*/
00241 
00242   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
00243    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
00244    * ....
00245    * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
00246    */
00247 
00248   /* Working pointer of inputA */
00249   px = pIn1;
00250 
00251   /* Working pointer of inputB: every stage-2 output starts from the last inputB sample */
00252   pSrc2 = pIn2 + (srcBLen - 1u);
00253   py = pSrc2;
00254 
00255   /* count is the offset into pIn1 at which the window for the current output starts */
00256   count = 0u;
00257 
00258   /* -------------------
00259    * Stage2 process
00260    * ------------------*/
00261 
00262   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
00263    * So, to loop unroll over blockSize2,
00264    * srcBLen should be greater than or equal to 4 */
00265   if(srcBLen >= 4u)
00266   {
00267     /* Loop unroll over blockSize2, by 4: each pass produces four consecutive outputs */
00268     blkCnt = blockSize2 >> 2u;
00269 
00270     while(blkCnt > 0u)
00271     {
00272       /* Set all accumulators to zero */
00273       acc0 = 0;
00274       acc1 = 0;
00275       acc2 = 0;
00276       acc3 = 0;
00277 
00278       /* read x[0], x[1], x[2] samples (indices are relative to pIn1 + count) */
00279       x0 = *(px++);
00280       x1 = *(px++);
00281       x2 = *(px++);
00282 
00283       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00284       k = srcBLen >> 2u;
00285 
00286       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00287        ** a second loop below computes MACs for the remaining 1 to 3 samples. k is at least 1 here because srcBLen >= 4, so the do/while body always has work to do. */
00288       do
00289       {
00290         /* Read y[srcBLen - 1] sample */
00291         c0 = *(py--);
00292         /* Read y[srcBLen - 2] sample */
00293         c1 = *(py--);
00294 
00295         /* Read x[3] sample */
00296         x3 = *(px++);
00297 
00298         /* x[0] and x[1] are packed */
00299         in1 = (q15_t) x0;
00300         in2 = (q15_t) x1;
00301 
00302         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
00303 
00304         /* y[srcBLen - 1]   and y[srcBLen - 2] are packed; input2 is reused for all four accumulators below */
00305         in1 = (q15_t) c0;
00306         in2 = (q15_t) c1;
00307 
00308         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
00309 
00310         /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2]  */
00311         acc0 = __SMLAD(input1, input2, acc0);
00312 
00313         /* x[1] and x[2] are packed */
00314         in1 = (q15_t) x1;
00315         in2 = (q15_t) x2;
00316 
00317         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
00318 
00319         /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2]  */
00320         acc1 = __SMLAD(input1, input2, acc1);
00321 
00322         /* x[2] and x[3] are packed */
00323         in1 = (q15_t) x2;
00324         in2 = (q15_t) x3;
00325 
00326         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
00327 
00328         /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2]  */
00329         acc2 = __SMLAD(input1, input2, acc2);
00330 
00331         /* Read x[4] sample (x0 is recycled to hold it) */
00332         x0 = *(px++);
00333 
00334         /* x[3] and x[4] are packed */
00335         in1 = (q15_t) x3;
00336         in2 = (q15_t) x0;
00337 
00338         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
00339 
00340         /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2]  */
00341         acc3 = __SMLAD(input1, input2, acc3);
00342 
00343         /* Read y[srcBLen - 3] sample */
00344         c0 = *(py--);
00345         /* Read y[srcBLen - 4] sample */
00346         c1 = *(py--);
00347 
00348         /* Read x[5] sample (x1 is recycled to hold it) */
00349         x1 = *(px++);
00350 
00351         /* x[2] and x[3] are packed */
00352         in1 = (q15_t) x2;
00353         in2 = (q15_t) x3;
00354 
00355         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
00356 
00357         /* y[srcBLen - 3] and y[srcBLen - 4] are packed */
00358         in1 = (q15_t) c0;
00359         in2 = (q15_t) c1;
00360 
00361         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
00362 
00363         /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4]  */
00364         acc0 = __SMLAD(input1, input2, acc0);
00365 
00366         /* x[3] and x[4] are packed */
00367         in1 = (q15_t) x3;
00368         in2 = (q15_t) x0;
00369 
00370         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
00371 
00372         /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4]  */
00373         acc1 = __SMLAD(input1, input2, acc1);
00374 
00375         /* x[4] and x[5] are packed */
00376         in1 = (q15_t) x0;
00377         in2 = (q15_t) x1;
00378 
00379         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
00380 
00381         /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4]  */
00382         acc2 = __SMLAD(input1, input2, acc2);
00383 
00384         /* Read x[6] sample (x2 is recycled to hold it); x0, x1, x2 now hold x[4], x[5], x[6] for the next pass */
00385         x2 = *(px++);
00386 
00387         /* x[5] and x[6] are packed */
00388         in1 = (q15_t) x1;
00389         in2 = (q15_t) x2;
00390 
00391         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
00392 
00393         /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4]  */
00394         acc3 = __SMLAD(input1, input2, acc3);
00395 
00396       } while(--k);
00397 
00398       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00399        ** No loop unrolling is used. */
00400       k = srcBLen % 0x4u;
00401 
00402       while(k > 0u)
00403       {
00404         /* Read y[srcBLen - 5] sample (indices in this loop are shown for its first pass when srcBLen is 5..7) */
00405         c0 = *(py--);
00406 
00407         /* Read x[7] sample */
00408         x3 = *(px++);
00409 
00410         /* Perform the multiply-accumulates */
00411         /* acc0 +=  x[4] * y[srcBLen - 5] */
00412         acc0 += ((q15_t) x0 * c0);
00413         /* acc1 +=  x[5] * y[srcBLen - 5] */
00414         acc1 += ((q15_t) x1 * c0);
00415         /* acc2 +=  x[6] * y[srcBLen - 5] */
00416         acc2 += ((q15_t) x2 * c0);
00417         /* acc3 +=  x[7] * y[srcBLen - 5] */
00418         acc3 += ((q15_t) x3 * c0);
00419 
00420         /* Shift the sample window by one so the next MAC reuses the samples already read */
00421         x0 = x1;
00422         x1 = x2;
00423         x2 = x3;
00424 
00425         /* Decrement the loop counter */
00426         k--;
00427       }
00428 
00429 
00430       /* Shift each accumulator right by 7, saturate to 8 bits and store the four output samples. */
00431       *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
00432       *pOut++ = (q7_t) (__SSAT(acc1 >> 7u, 8));
00433       *pOut++ = (q7_t) (__SSAT(acc2 >> 7u, 8));
00434       *pOut++ = (q7_t) (__SSAT(acc3 >> 7u, 8));
00435 
00436       /* Advance the inputA window offset by the 4 outputs just produced */
00437       count += 4u;
00438 
00439       /* Update the inputA and inputB pointers for next MAC calculation */
00440       px = pIn1 + count;
00441       py = pSrc2;
00442 
00443       /* Decrement the loop counter */
00444       blkCnt--;
00445     }
00446 
00447     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
00448      ** One output is produced per iteration. */
00449     blkCnt = blockSize2 % 0x4u;
00450 
00451     while(blkCnt > 0u)
00452     {
00453       /* Accumulator is made zero for every iteration */
00454       sum = 0;
00455 
00456       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00457       k = srcBLen >> 2u;
00458 
00459       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00460        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00461       while(k > 0u)
00462       {
00463 
00464         /* Reading two inputs of SrcA buffer and packing */
00465         in1 = (q15_t) * px++;
00466         in2 = (q15_t) * px++;
00467         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
00468 
00469         /* Reading two inputs of SrcB buffer and packing */
00470         in1 = (q15_t) * py--;
00471         in2 = (q15_t) * py--;
00472         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
00473 
00474         /* Perform the multiply-accumulates */
00475         sum = __SMLAD(input1, input2, sum);
00476 
00477         /* Reading two inputs of SrcA buffer and packing */
00478         in1 = (q15_t) * px++;
00479         in2 = (q15_t) * px++;
00480         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
00481 
00482         /* Reading two inputs of SrcB buffer and packing */
00483         in1 = (q15_t) * py--;
00484         in2 = (q15_t) * py--;
00485         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
00486 
00487         /* Perform the multiply-accumulates */
00488         sum = __SMLAD(input1, input2, sum);
00489 
00490         /* Decrement the loop counter */
00491         k--;
00492       }
00493 
00494       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00495        ** No loop unrolling is used. */
00496       k = srcBLen % 0x4u;
00497 
00498       while(k > 0u)
00499       {
00500         /* Perform the multiply-accumulates */
00501         sum += ((q15_t) * px++ * *py--);
00502 
00503         /* Decrement the loop counter */
00504         k--;
00505       }
00506 
00507       /* Shift the accumulator right by 7, saturate to 8 bits and store the output sample. */
00508       *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
00509 
00510       /* Advance the inputA window offset by 1 */
00511       count++;
00512 
00513       /* Update the inputA and inputB pointers for next MAC calculation */
00514       px = pIn1 + count;
00515       py = pSrc2;
00516 
00517       /* Decrement the loop counter */
00518       blkCnt--;
00519     }
00520   }
00521   else
00522   {
00523     /* srcBLen is less than 4 here, so no unrolling is applied:
00524      * each of the blockSize2 outputs is computed with a plain MAC loop */
00525     blkCnt = blockSize2;
00526 
00527     while(blkCnt > 0u)
00528     {
00529       /* Accumulator is made zero for every iteration */
00530       sum = 0;
00531 
00532       /* srcBLen number of MACS should be performed */
00533       k = srcBLen;
00534 
00535       while(k > 0u)
00536       {
00537         /* Perform the multiply-accumulate */
00538         sum += ((q15_t) * px++ * *py--);
00539 
00540         /* Decrement the loop counter */
00541         k--;
00542       }
00543 
00544       /* Shift the accumulator right by 7, saturate to 8 bits and store the output sample. */
00545       *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
00546 
00547       /* Advance the inputA window offset by 1 (count is an offset in stage 2, not a MAC count) */
00548       count++;
00549 
00550       /* Update the inputA and inputB pointers for next MAC calculation */
00551       px = pIn1 + count;
00552       py = pSrc2;
00553 
00554       /* Decrement the loop counter */
00555       blkCnt--;
00556     }
00557   }
00558 
00559 
00560   /* --------------------------
00561    * Initializations of stage3
00562    * -------------------------*/
00563 
00564   /* sum = x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
00565    * sum = x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
00566    * ....
00567    * sum =  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
00568    * sum =  x[srcALen-1] * y[srcBLen-1]
00569    */
00570 
00571   /* In this stage the MAC operations are decreased by 1 for every iteration.
00572      The blockSize3 variable is both the number of outputs left and the number of MACs for the current output */
00573 
00574   /* Working pointer of inputA: the first stage-3 window starts at x[srcALen - srcBLen + 1] */
00575   pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
00576   px = pSrc1;
00577 
00578   /* Working pointer of inputB: every stage-3 output starts from the last inputB sample */
00579   pSrc2 = pIn2 + (srcBLen - 1u);
00580   py = pSrc2;
00581 
00582   /* -------------------
00583    * Stage3 process
00584    * ------------------*/
00585 
00586   while(blockSize3 > 0u)
00587   {
00588     /* Accumulator is made zero for every iteration */
00589     sum = 0;
00590 
00591     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00592     k = blockSize3 >> 2u;
00593 
00594     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00595      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00596     while(k > 0u)
00597     {
00598       /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing (indices shown for the first stage-3 output) */
00599       in1 = (q15_t) * px++;
00600       in2 = (q15_t) * px++;
00601       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
00602 
00603       /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
00604       in1 = (q15_t) * py--;
00605       in2 = (q15_t) * py--;
00606       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
00607 
00608       /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
00609       /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
00610       sum = __SMLAD(input1, input2, sum);
00611 
00612       /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
00613       in1 = (q15_t) * px++;
00614       in2 = (q15_t) * px++;
00615       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
00616 
00617       /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
00618       in1 = (q15_t) * py--;
00619       in2 = (q15_t) * py--;
00620       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
00621 
00622       /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
00623       /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
00624       sum = __SMLAD(input1, input2, sum);
00625 
00626       /* Decrement the loop counter */
00627       k--;
00628     }
00629 
00630     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
00631      ** No loop unrolling is used. */
00632     k = blockSize3 % 0x4u;
00633 
00634     while(k > 0u)
00635     {
00636       /* Perform the multiply-accumulates */
00637       sum += ((q15_t) * px++ * *py--);
00638 
00639       /* Decrement the loop counter */
00640       k--;
00641     }
00642 
00643     /* Shift the accumulator right by 7, saturate to 8 bits and store the output sample. */
00644     *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
00645 
00646     /* Reset the pointers for the next output: the inputA window start moves forward by one, py restarts at the last inputB sample */
00647     px = ++pSrc1;
00648     py = pSrc2;
00649 
00650     /* Decrement the loop counter (this also shortens the next output by one MAC) */
00651     blockSize3--;
00652   }
00653 
00654 #else
00655 
00656   /* Run the below code for Cortex-M0 */
00657 
00658   q7_t *pIn1 = pSrcA;                            /* first input sequence, indexed by j */
00659   q7_t *pIn2 = pSrcB;                            /* second input sequence, indexed by i - j */
00660   q31_t sum;                                     /* Accumulator */
00661   uint32_t i, j;                                 /* i: output index, j: index into pIn1 */
00662 
00663   /* One output sample per iteration. NOTE(review): (srcALen + srcBLen - 1) wraps around when both lengths are 0 - presumably callers must pass lengths >= 1; confirm. */
00664   for (i = 0; i < (srcALen + srcBLen - 1); i++)
00665   {
00666     /* Initialize sum with zero to carry on MAC operations */
00667     sum = 0;
00668 
00669     /* Loop to perform MAC operations according to convolution equation */
00670     for (j = 0; j <= i; j++)
00671     {
00672       /* Skip index pairs that fall outside either input array */
00673       if(((i - j) < srcBLen) && (j < srcALen))
00674       {
00675         /* z[i] += x[j] * y[i-j] */
00676         sum += (q15_t) pIn1[j] * (pIn2[i - j]);
00677       }
00678     }
00679 
00680     /* Shift the accumulator right by 7, saturate to 8 bits and store the output sample. */
00681     pDst[i] = (q7_t) __SSAT((sum >> 7u), 8u);
00682   }
00683 
00684 #endif /*   #ifndef ARM_MATH_CM0_FAMILY        */
00685 
00686 }
00687 
00688 /**   
00689  * @} end of Conv group   
00690  */