Robert Lopez / CMSIS5
Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_q7.c Source File

arm_conv_q7.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_conv_q7.c
00004  * Description:  Convolution of Q7 sequences
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupFilters
00033  */
00034 
00035 /**
00036  * @addtogroup Conv
00037  * @{
00038  */
00039 
00040 /**
00041  * @brief Convolution of Q7 sequences.
00042  * @param[in] *pSrcA points to the first input sequence.
00043  * @param[in] srcALen length of the first input sequence.
00044  * @param[in] *pSrcB points to the second input sequence.
00045  * @param[in] srcBLen length of the second input sequence.
00046  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.
00047  * @return none.
00048  *
00049  * @details
00050  * <b>Scaling and Overflow Behavior:</b>
00051  *
00052  * \par
00053  * The function is implemented using a 32-bit internal accumulator.
00054  * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
00055  * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
00056  * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
00057  * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
00058  *
00059  * \par
00060  * Refer the function <code>arm_conv_opt_q7()</code> for a faster implementation of this function.
00061  *
00062  */
00063 
00064 void arm_conv_q7(
00065   q7_t * pSrcA,
00066   uint32_t srcALen,
00067   q7_t * pSrcB,
00068   uint32_t srcBLen,
00069   q7_t * pDst)
00070 {
00071 
00072 
00073 #if defined (ARM_MATH_DSP)
00074 
00075   /* Run the below code for Cortex-M4 and Cortex-M3 */
00076 
00077   q7_t *pIn1;                                    /* inputA pointer */
00078   q7_t *pIn2;                                    /* inputB pointer */
00079   q7_t *pOut = pDst;                             /* output pointer */
00080   q7_t *px;                                      /* Intermediate inputA pointer */
00081   q7_t *py;                                      /* Intermediate inputB pointer */
00082   q7_t *pSrc1, *pSrc2;                           /* Intermediate pointers */
00083   q7_t x0, x1, x2, x3, c0, c1;                   /* Temporary variables to hold state and coefficient values */
00084   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */
00085   q31_t input1, input2;                          /* Temporary input variables */
00086   q15_t in1, in2;                                /* Temporary input variables */
00087   uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3;     /* loop counter */
00088 
00089   /* The algorithm implementation is based on the lengths of the inputs. */
00090   /* srcB is always made to slide across srcA. */
00091   /* So srcBLen is always considered as shorter or equal to srcALen */
00092   if (srcALen >= srcBLen)
00093   {
00094     /* Initialization of inputA pointer */
00095     pIn1 = pSrcA;
00096 
00097     /* Initialization of inputB pointer */
00098     pIn2 = pSrcB;
00099   }
00100   else
00101   {
00102     /* Initialization of inputA pointer */
00103     pIn1 = pSrcB;
00104 
00105     /* Initialization of inputB pointer */
00106     pIn2 = pSrcA;
00107 
00108     /* srcBLen is always considered as shorter or equal to srcALen */
00109     j = srcBLen;
00110     srcBLen = srcALen;
00111     srcALen = j;
00112   }
00113 
00114   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00115   /* The function is internally
00116    * divided into three stages according to the number of multiplications that has to be
00117    * taken place between inputA samples and inputB samples. In the first stage of the
00118    * algorithm, the multiplications increase by one for every iteration.
00119    * In the second stage of the algorithm, srcBLen number of multiplications are done.
00120    * In the third stage of the algorithm, the multiplications decrease by one
00121    * for every iteration. */
00122 
00123   /* The algorithm is implemented in three stages.
00124      The loop counters of each stage is initiated here. */
00125   blockSize1 = srcBLen - 1U;
00126   blockSize2 = (srcALen - srcBLen) + 1U;
00127   blockSize3 = blockSize1;
00128 
00129   /* --------------------------
00130    * Initializations of stage1
00131    * -------------------------*/
00132 
00133   /* sum = x[0] * y[0]
00134    * sum = x[0] * y[1] + x[1] * y[0]
00135    * ....
00136    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
00137    */
00138 
00139   /* In this stage the MAC operations are increased by 1 for every iteration.
00140      The count variable holds the number of MAC operations performed */
00141   count = 1U;
00142 
00143   /* Working pointer of inputA */
00144   px = pIn1;
00145 
00146   /* Working pointer of inputB */
00147   py = pIn2;
00148 
00149 
00150   /* ------------------------
00151    * Stage1 process
00152    * ----------------------*/
00153 
00154   /* The first stage starts here */
00155   while (blockSize1 > 0U)
00156   {
00157     /* Accumulator is made zero for every iteration */
00158     sum = 0;
00159 
00160     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00161     k = count >> 2U;
00162 
00163     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00164      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00165     while (k > 0U)
00166     {
00167       /* x[0] , x[1] */
00168       in1 = (q15_t) * px++;
00169       in2 = (q15_t) * px++;
00170       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
00171 
00172       /* y[srcBLen - 1] , y[srcBLen - 2] */
00173       in1 = (q15_t) * py--;
00174       in2 = (q15_t) * py--;
00175       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
00176 
00177       /* x[0] * y[srcBLen - 1] */
00178       /* x[1] * y[srcBLen - 2] */
00179       sum = __SMLAD(input1, input2, sum);
00180 
00181       /* x[2] , x[3] */
00182       in1 = (q15_t) * px++;
00183       in2 = (q15_t) * px++;
00184       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
00185 
00186       /* y[srcBLen - 3] , y[srcBLen - 4] */
00187       in1 = (q15_t) * py--;
00188       in2 = (q15_t) * py--;
00189       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
00190 
00191       /* x[2] * y[srcBLen - 3] */
00192       /* x[3] * y[srcBLen - 4] */
00193       sum = __SMLAD(input1, input2, sum);
00194 
00195       /* Decrement the loop counter */
00196       k--;
00197     }
00198 
00199     /* If the count is not a multiple of 4, compute any remaining MACs here.
00200      ** No loop unrolling is used. */
00201     k = count % 0x4U;
00202 
00203     while (k > 0U)
00204     {
00205       /* Perform the multiply-accumulates */
00206       sum += ((q15_t) * px++ * *py--);
00207 
00208       /* Decrement the loop counter */
00209       k--;
00210     }
00211 
00212     /* Store the result in the accumulator in the destination buffer. */
00213     *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
00214 
00215     /* Update the inputA and inputB pointers for next MAC calculation */
00216     py = pIn2 + count;
00217     px = pIn1;
00218 
00219     /* Increment the MAC count */
00220     count++;
00221 
00222     /* Decrement the loop counter */
00223     blockSize1--;
00224   }
00225 
00226   /* --------------------------
00227    * Initializations of stage2
00228    * ------------------------*/
00229 
00230   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
00231    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
00232    * ....
00233    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
00234    */
00235 
00236   /* Working pointer of inputA */
00237   px = pIn1;
00238 
00239   /* Working pointer of inputB */
00240   pSrc2 = pIn2 + (srcBLen - 1U);
00241   py = pSrc2;
00242 
00243   /* count is index by which the pointer pIn1 to be incremented */
00244   count = 0U;
00245 
00246   /* -------------------
00247    * Stage2 process
00248    * ------------------*/
00249 
00250   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
00251    * So, to loop unroll over blockSize2,
00252    * srcBLen should be greater than or equal to 4 */
00253   if (srcBLen >= 4U)
00254   {
00255     /* Loop unroll over blockSize2, by 4 */
00256     blkCnt = blockSize2 >> 2U;
00257 
00258     while (blkCnt > 0U)
00259     {
00260       /* Set all accumulators to zero */
00261       acc0 = 0;
00262       acc1 = 0;
00263       acc2 = 0;
00264       acc3 = 0;
00265 
00266       /* read x[0], x[1], x[2] samples */
00267       x0 = *(px++);
00268       x1 = *(px++);
00269       x2 = *(px++);
00270 
00271       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00272       k = srcBLen >> 2U;
00273 
00274       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00275        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00276       do
00277       {
00278         /* Read y[srcBLen - 1] sample */
00279         c0 = *(py--);
00280         /* Read y[srcBLen - 2] sample */
00281         c1 = *(py--);
00282 
00283         /* Read x[3] sample */
00284         x3 = *(px++);
00285 
00286         /* x[0] and x[1] are packed */
00287         in1 = (q15_t) x0;
00288         in2 = (q15_t) x1;
00289 
00290         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
00291 
00292         /* y[srcBLen - 1]   and y[srcBLen - 2] are packed */
00293         in1 = (q15_t) c0;
00294         in2 = (q15_t) c1;
00295 
00296         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
00297 
00298         /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2]  */
00299         acc0 = __SMLAD(input1, input2, acc0);
00300 
00301         /* x[1] and x[2] are packed */
00302         in1 = (q15_t) x1;
00303         in2 = (q15_t) x2;
00304 
00305         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
00306 
00307         /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2]  */
00308         acc1 = __SMLAD(input1, input2, acc1);
00309 
00310         /* x[2] and x[3] are packed */
00311         in1 = (q15_t) x2;
00312         in2 = (q15_t) x3;
00313 
00314         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
00315 
00316         /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2]  */
00317         acc2 = __SMLAD(input1, input2, acc2);
00318 
00319         /* Read x[4] sample */
00320         x0 = *(px++);
00321 
00322         /* x[3] and x[4] are packed */
00323         in1 = (q15_t) x3;
00324         in2 = (q15_t) x0;
00325 
00326         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
00327 
00328         /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2]  */
00329         acc3 = __SMLAD(input1, input2, acc3);
00330 
00331         /* Read y[srcBLen - 3] sample */
00332         c0 = *(py--);
00333         /* Read y[srcBLen - 4] sample */
00334         c1 = *(py--);
00335 
00336         /* Read x[5] sample */
00337         x1 = *(px++);
00338 
00339         /* x[2] and x[3] are packed */
00340         in1 = (q15_t) x2;
00341         in2 = (q15_t) x3;
00342 
00343         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
00344 
00345         /* y[srcBLen - 3] and y[srcBLen - 4] are packed */
00346         in1 = (q15_t) c0;
00347         in2 = (q15_t) c1;
00348 
00349         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
00350 
00351         /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4]  */
00352         acc0 = __SMLAD(input1, input2, acc0);
00353 
00354         /* x[3] and x[4] are packed */
00355         in1 = (q15_t) x3;
00356         in2 = (q15_t) x0;
00357 
00358         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
00359 
00360         /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4]  */
00361         acc1 = __SMLAD(input1, input2, acc1);
00362 
00363         /* x[4] and x[5] are packed */
00364         in1 = (q15_t) x0;
00365         in2 = (q15_t) x1;
00366 
00367         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
00368 
00369         /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4]  */
00370         acc2 = __SMLAD(input1, input2, acc2);
00371 
00372         /* Read x[6] sample */
00373         x2 = *(px++);
00374 
00375         /* x[5] and x[6] are packed */
00376         in1 = (q15_t) x1;
00377         in2 = (q15_t) x2;
00378 
00379         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
00380 
00381         /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4]  */
00382         acc3 = __SMLAD(input1, input2, acc3);
00383 
00384       } while (--k);
00385 
00386       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00387        ** No loop unrolling is used. */
00388       k = srcBLen % 0x4U;
00389 
00390       while (k > 0U)
00391       {
00392         /* Read y[srcBLen - 5] sample */
00393         c0 = *(py--);
00394 
00395         /* Read x[7] sample */
00396         x3 = *(px++);
00397 
00398         /* Perform the multiply-accumulates */
00399         /* acc0 +=  x[4] * y[srcBLen - 5] */
00400         acc0 += ((q15_t) x0 * c0);
00401         /* acc1 +=  x[5] * y[srcBLen - 5] */
00402         acc1 += ((q15_t) x1 * c0);
00403         /* acc2 +=  x[6] * y[srcBLen - 5] */
00404         acc2 += ((q15_t) x2 * c0);
00405         /* acc3 +=  x[7] * y[srcBLen - 5] */
00406         acc3 += ((q15_t) x3 * c0);
00407 
00408         /* Reuse the present samples for the next MAC */
00409         x0 = x1;
00410         x1 = x2;
00411         x2 = x3;
00412 
00413         /* Decrement the loop counter */
00414         k--;
00415       }
00416 
00417 
00418       /* Store the result in the accumulator in the destination buffer. */
00419       *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8));
00420       *pOut++ = (q7_t) (__SSAT(acc1 >> 7U, 8));
00421       *pOut++ = (q7_t) (__SSAT(acc2 >> 7U, 8));
00422       *pOut++ = (q7_t) (__SSAT(acc3 >> 7U, 8));
00423 
00424       /* Increment the pointer pIn1 index, count by 4 */
00425       count += 4U;
00426 
00427       /* Update the inputA and inputB pointers for next MAC calculation */
00428       px = pIn1 + count;
00429       py = pSrc2;
00430 
00431       /* Decrement the loop counter */
00432       blkCnt--;
00433     }
00434 
00435     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
00436      ** No loop unrolling is used. */
00437     blkCnt = blockSize2 % 0x4U;
00438 
00439     while (blkCnt > 0U)
00440     {
00441       /* Accumulator is made zero for every iteration */
00442       sum = 0;
00443 
00444       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00445       k = srcBLen >> 2U;
00446 
00447       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00448        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00449       while (k > 0U)
00450       {
00451 
00452         /* Reading two inputs of SrcA buffer and packing */
00453         in1 = (q15_t) * px++;
00454         in2 = (q15_t) * px++;
00455         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
00456 
00457         /* Reading two inputs of SrcB buffer and packing */
00458         in1 = (q15_t) * py--;
00459         in2 = (q15_t) * py--;
00460         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
00461 
00462         /* Perform the multiply-accumulates */
00463         sum = __SMLAD(input1, input2, sum);
00464 
00465         /* Reading two inputs of SrcA buffer and packing */
00466         in1 = (q15_t) * px++;
00467         in2 = (q15_t) * px++;
00468         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
00469 
00470         /* Reading two inputs of SrcB buffer and packing */
00471         in1 = (q15_t) * py--;
00472         in2 = (q15_t) * py--;
00473         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
00474 
00475         /* Perform the multiply-accumulates */
00476         sum = __SMLAD(input1, input2, sum);
00477 
00478         /* Decrement the loop counter */
00479         k--;
00480       }
00481 
00482       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00483        ** No loop unrolling is used. */
00484       k = srcBLen % 0x4U;
00485 
00486       while (k > 0U)
00487       {
00488         /* Perform the multiply-accumulates */
00489         sum += ((q15_t) * px++ * *py--);
00490 
00491         /* Decrement the loop counter */
00492         k--;
00493       }
00494 
00495       /* Store the result in the accumulator in the destination buffer. */
00496       *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
00497 
00498       /* Increment the pointer pIn1 index, count by 1 */
00499       count++;
00500 
00501       /* Update the inputA and inputB pointers for next MAC calculation */
00502       px = pIn1 + count;
00503       py = pSrc2;
00504 
00505       /* Decrement the loop counter */
00506       blkCnt--;
00507     }
00508   }
00509   else
00510   {
00511     /* If the srcBLen is not a multiple of 4,
00512      * the blockSize2 loop cannot be unrolled by 4 */
00513     blkCnt = blockSize2;
00514 
00515     while (blkCnt > 0U)
00516     {
00517       /* Accumulator is made zero for every iteration */
00518       sum = 0;
00519 
00520       /* srcBLen number of MACS should be performed */
00521       k = srcBLen;
00522 
00523       while (k > 0U)
00524       {
00525         /* Perform the multiply-accumulate */
00526         sum += ((q15_t) * px++ * *py--);
00527 
00528         /* Decrement the loop counter */
00529         k--;
00530       }
00531 
00532       /* Store the result in the accumulator in the destination buffer. */
00533       *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
00534 
00535       /* Increment the MAC count */
00536       count++;
00537 
00538       /* Update the inputA and inputB pointers for next MAC calculation */
00539       px = pIn1 + count;
00540       py = pSrc2;
00541 
00542       /* Decrement the loop counter */
00543       blkCnt--;
00544     }
00545   }
00546 
00547 
00548   /* --------------------------
00549    * Initializations of stage3
00550    * -------------------------*/
00551 
00552   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
00553    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
00554    * ....
00555    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
00556    * sum +=  x[srcALen-1] * y[srcBLen-1]
00557    */
00558 
00559   /* In this stage the MAC operations are decreased by 1 for every iteration.
00560      The blockSize3 variable holds the number of MAC operations performed */
00561 
00562   /* Working pointer of inputA */
00563   pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
00564   px = pSrc1;
00565 
00566   /* Working pointer of inputB */
00567   pSrc2 = pIn2 + (srcBLen - 1U);
00568   py = pSrc2;
00569 
00570   /* -------------------
00571    * Stage3 process
00572    * ------------------*/
00573 
00574   while (blockSize3 > 0U)
00575   {
00576     /* Accumulator is made zero for every iteration */
00577     sum = 0;
00578 
00579     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00580     k = blockSize3 >> 2U;
00581 
00582     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00583      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00584     while (k > 0U)
00585     {
00586       /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
00587       in1 = (q15_t) * px++;
00588       in2 = (q15_t) * px++;
00589       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
00590 
00591       /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
00592       in1 = (q15_t) * py--;
00593       in2 = (q15_t) * py--;
00594       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
00595 
00596       /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
00597       /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
00598       sum = __SMLAD(input1, input2, sum);
00599 
00600       /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
00601       in1 = (q15_t) * px++;
00602       in2 = (q15_t) * px++;
00603       input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
00604 
00605       /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
00606       in1 = (q15_t) * py--;
00607       in2 = (q15_t) * py--;
00608       input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
00609 
00610       /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
00611       /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
00612       sum = __SMLAD(input1, input2, sum);
00613 
00614       /* Decrement the loop counter */
00615       k--;
00616     }
00617 
00618     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
00619      ** No loop unrolling is used. */
00620     k = blockSize3 % 0x4U;
00621 
00622     while (k > 0U)
00623     {
00624       /* Perform the multiply-accumulates */
00625       sum += ((q15_t) * px++ * *py--);
00626 
00627       /* Decrement the loop counter */
00628       k--;
00629     }
00630 
00631     /* Store the result in the accumulator in the destination buffer. */
00632     *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
00633 
00634     /* Update the inputA and inputB pointers for next MAC calculation */
00635     px = ++pSrc1;
00636     py = pSrc2;
00637 
00638     /* Decrement the loop counter */
00639     blockSize3--;
00640   }
00641 
00642 #else
00643 
00644   /* Run the below code for Cortex-M0 */
00645 
00646   q7_t *pIn1 = pSrcA;                            /* input pointer */
00647   q7_t *pIn2 = pSrcB;                            /* coefficient pointer */
00648   q31_t sum;                                     /* Accumulator */
00649   uint32_t i, j;                                 /* loop counter */
00650 
00651   /* Loop to calculate output of convolution for output length number of times */
00652   for (i = 0; i < (srcALen + srcBLen - 1); i++)
00653   {
00654     /* Initialize sum with zero to carry on MAC operations */
00655     sum = 0;
00656 
00657     /* Loop to perform MAC operations according to convolution equation */
00658     for (j = 0; j <= i; j++)
00659     {
00660       /* Check the array limitations */
00661       if (((i - j) < srcBLen) && (j < srcALen))
00662       {
00663         /* z[i] += x[i-j] * y[j] */
00664         sum += (q15_t) pIn1[j] * (pIn2[i - j]);
00665       }
00666     }
00667 
00668     /* Store the output in the destination buffer */
00669     pDst[i] = (q7_t) __SSAT((sum >> 7U), 8U);
00670   }
00671 
00672 #endif /*   #if defined (ARM_MATH_DSP)        */
00673 
00674 }
00675 
00676 /**
00677  * @} end of Conv group
00678  */
00679