Added CMSIS5 DSP and NN folder. Needs some work

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_q15.c Source File

arm_conv_q15.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_conv_q15.c
00004  * Description:  Convolution of Q15 sequences
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupFilters
00033  */
00034 
00035 /**
00036  * @addtogroup Conv
00037  * @{
00038  */
00039 
00040 /**
00041  * @brief Convolution of Q15 sequences.
00042  * @param[in] *pSrcA points to the first input sequence.
00043  * @param[in] srcALen length of the first input sequence.
00044  * @param[in] *pSrcB points to the second input sequence.
00045  * @param[in] srcBLen length of the second input sequence.
00046  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.
00047  * @return none.
00048  *
00049  * @details
00050  * <b>Scaling and Overflow Behavior:</b>
00051  *
00052  * \par
00053  * The function is implemented using a 64-bit internal accumulator.
00054  * Both inputs are in 1.15 format and multiplications yield a 2.30 result.
00055  * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
00056  * This approach provides 33 guard bits and there is no risk of overflow.
00057  * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
00058  *
00059  * \par
00060  * Refer to <code>arm_conv_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
00061  *
00062  * \par
00063  * Refer to the function <code>arm_conv_opt_q15()</code> for a faster implementation of this function using scratch buffers.
00064  *
00065  */
00066 
00067 void arm_conv_q15(
00068   q15_t * pSrcA,
00069   uint32_t srcALen,
00070   q15_t * pSrcB,
00071   uint32_t srcBLen,
00072   q15_t * pDst)
00073 {
00074   /* NOTE(review): lengths of 0 are not checked in either branch below; (srcBLen - 1U) and (srcALen + srcBLen - 1) would wrap - confirm callers never pass 0. */
00075 #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
00076 
00077   /* Run the below code for Cortex-M7, Cortex-M4 and Cortex-M3 when UNALIGNED_SUPPORT_DISABLE is not defined */
00078 
00079   q15_t *pIn1;                                   /* inputA pointer */
00080   q15_t *pIn2;                                   /* inputB pointer */
00081   q15_t *pOut = pDst;                            /* output pointer */
00082   q63_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */
00083   q15_t *px;                                     /* Intermediate inputA pointer  */
00084   q15_t *py;                                     /* Intermediate inputB pointer  */
00085   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
00086   q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold state and coefficient values */
00087   uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt;     /* Stage sizes, loop counters and swap temporary */
00088 
00089   /* The algorithm implementation is based on the lengths of the inputs. */
00090   /* srcB is always made to slide across srcA. */
00091   /* So srcBLen is always considered as shorter or equal to srcALen */
00092   if (srcALen >= srcBLen)
00093   {
00094     /* Initialization of inputA pointer */
00095     pIn1 = pSrcA;
00096 
00097     /* Initialization of inputB pointer */
00098     pIn2 = pSrcB;
00099   }
00100   else
00101   {
00102     /* Initialization of inputA pointer */
00103     pIn1 = pSrcB;
00104 
00105     /* Initialization of inputB pointer */
00106     pIn2 = pSrcA;
00107 
00108     /* Swap the lengths (j is the temporary) so that srcBLen holds the shorter length */
00109     j = srcBLen;
00110     srcBLen = srcALen;
00111     srcALen = j;
00112   }
00113 
00114   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00115   /* The function is internally
00116    * divided into three stages according to the number of multiplications that has to be
00117    * taken place between inputA samples and inputB samples. In the first stage of the
00118    * algorithm, the multiplications increase by one for every iteration.
00119    * In the second stage of the algorithm, srcBLen number of multiplications are done.
00120    * In the third stage of the algorithm, the multiplications decrease by one
00121    * for every iteration. */
00122 
00123   /* The algorithm is implemented in three stages.
00124      The loop counters of each stage is initiated here. */
00125   blockSize1 = srcBLen - 1U;
00126   blockSize2 = srcALen - (srcBLen - 1U);
00127 
00128   /* --------------------------
00129    * Initializations of stage1
00130    * -------------------------*/
00131 
00132   /* sum = x[0] * y[0]
00133    * sum = x[0] * y[1] + x[1] * y[0]
00134    * ....
00135    * sum = x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] +...+ x[srcBLen - 1] * y[0]
00136    */
00137 
00138   /* In this stage the MAC operations are increased by 1 for every iteration.
00139      The count variable holds the number of MAC operations performed */
00140   count = 1U;
00141 
00142   /* Working pointer of inputA */
00143   px = pIn1;
00144 
00145   /* Working pointer of inputB */
00146   py = pIn2;
00147 
00148 
00149   /* ------------------------
00150    * Stage1 process
00151    * ----------------------*/
00152 
00153   /* For loop unrolling by 4, this stage is divided into two. */
00154   /* First part of this stage computes the MAC operations less than 4 */
00155   /* Second part of this stage computes the MAC operations greater than or equal to 4 */
00156 
00157   /* The first part of the stage starts here */
00158   while ((count < 4U) && (blockSize1 > 0U))
00159   {
00160     /* Accumulator is made zero for every iteration */
00161     sum = 0;
00162 
00163     /* Loop over number of MAC operations between
00164      * inputA samples and inputB samples */
00165     k = count;
00166 
00167     while (k > 0U)
00168     {
00169       /* Perform the multiply-accumulates */
00170       sum = __SMLALD(*px++, *py--, sum);
00171 
00172       /* Decrement the loop counter */
00173       k--;
00174     }
00175 
00176     /* Store the result in the accumulator in the destination buffer. */
00177     *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00178 
00179     /* Update the inputA and inputB pointers for next MAC calculation */
00180     py = pIn2 + count;
00181     px = pIn1;
00182 
00183     /* Increment the MAC count */
00184     count++;
00185 
00186     /* Decrement the loop counter */
00187     blockSize1--;
00188   }
00189 
00190   /* The second part of the stage starts here */
00191   /* The internal loop, over count, is unrolled by 4 */
00192   /* To read the last two needed inputB samples using SIMD:
00193    * y[count - 1] and y[count - 2], py is decremented by 1 */
00194   py = py - 1;
00195 
00196   while (blockSize1 > 0U)
00197   {
00198     /* Accumulator is made zero for every iteration */
00199     sum = 0;
00200 
00201     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00202     k = count >> 2U;
00203 
00204     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00205      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00206     while (k > 0U)
00207     {
00208       /* Perform the multiply-accumulates */
00209       /* On the first pass x[0], x[1] are multiplied with y[count - 1], y[count - 2] respectively */
00210       sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00211       /* On the first pass x[2], x[3] are multiplied with y[count - 3], y[count - 4] respectively */
00212       sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00213 
00214       /* Decrement the loop counter */
00215       k--;
00216     }
00217 
00218     /* For the next MAC operations, the pointer py is used without SIMD
00219      * So, py is incremented by 1 */
00220     py = py + 1U;
00221 
00222     /* If the count is not a multiple of 4, compute any remaining MACs here.
00223      ** No loop unrolling is used. */
00224     k = count % 0x4U;
00225 
00226     while (k > 0U)
00227     {
00228       /* Perform the multiply-accumulates */
00229       sum = __SMLALD(*px++, *py--, sum);
00230 
00231       /* Decrement the loop counter */
00232       k--;
00233     }
00234 
00235     /* Store the result in the accumulator in the destination buffer. */
00236     *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00237 
00238     /* Update the inputA and inputB pointers for next MAC calculation */
00239     py = pIn2 + (count - 1U);
00240     px = pIn1;
00241 
00242     /* Increment the MAC count */
00243     count++;
00244 
00245     /* Decrement the loop counter */
00246     blockSize1--;
00247   }
00248 
00249   /* --------------------------
00250    * Initializations of stage2
00251    * ------------------------*/
00252 
00253   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
00254    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
00255    * ....
00256    * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
00257    */
00258 
00259   /* Working pointer of inputA */
00260   px = pIn1;
00261 
00262   /* Working pointer of inputB */
00263   pSrc2 = pIn2 + (srcBLen - 1U);
00264   py = pSrc2;
00265 
00266   /* count is the index by which the pointer pIn1 is to be incremented */
00267   count = 0U;
00268 
00269 
00270   /* --------------------
00271    * Stage2 process
00272    * -------------------*/
00273 
00274   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
00275    * So, to loop unroll over blockSize2,
00276    * srcBLen should be greater than or equal to 4 */
00277   if (srcBLen >= 4U)
00278   {
00279     /* Loop unroll over blockSize2, by 4 */
00280     blkCnt = blockSize2 >> 2U;
00281 
00282     while (blkCnt > 0U)
00283     {
00284       py = py - 1U;
00285 
00286       /* Set all accumulators to zero */
00287       acc0 = 0;
00288       acc1 = 0;
00289       acc2 = 0;
00290       acc3 = 0;
00291 
00292 
00293       /* read x[0], x[1] samples */
00294       x0 = *__SIMD32(px);
00295       /* read x[1], x[2] samples */
00296       x1 = _SIMD32_OFFSET(px+1);
00297       px+= 2U;
00298 
00299 
00300       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00301       k = srcBLen >> 2U;
00302 
00303       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00304        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00305       do
00306       {
00307         /* Read the last two inputB samples using SIMD:
00308          * y[srcBLen - 1] and y[srcBLen - 2] */
00309         c0 = *__SIMD32(py)--;
00310 
00311         /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
00312         acc0 = __SMLALDX(x0, c0, acc0);
00313 
00314         /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
00315         acc1 = __SMLALDX(x1, c0, acc1);
00316 
00317         /* Read x[2], x[3] */
00318         x2 = *__SIMD32(px);
00319 
00320         /* Read x[3], x[4] */
00321         x3 = _SIMD32_OFFSET(px+1);
00322 
00323         /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
00324         acc2 = __SMLALDX(x2, c0, acc2);
00325 
00326         /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
00327         acc3 = __SMLALDX(x3, c0, acc3);
00328 
00329         /* Read y[srcBLen - 3] and y[srcBLen - 4] */
00330         c0 = *__SIMD32(py)--;
00331 
00332         /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
00333         acc0 = __SMLALDX(x2, c0, acc0);
00334 
00335         /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
00336         acc1 = __SMLALDX(x3, c0, acc1);
00337 
00338         /* Read x[4], x[5] */
00339         x0 = _SIMD32_OFFSET(px+2);
00340 
00341         /* Read x[5], x[6] */
00342         x1 = _SIMD32_OFFSET(px+3);
00343         px += 4U;
00344 
00345         /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
00346         acc2 = __SMLALDX(x0, c0, acc2);
00347 
00348         /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
00349         acc3 = __SMLALDX(x1, c0, acc3);
00350 
00351       } while (--k);
00352 
00353       /* For the remaining MAC operations py is not adjusted here;
00354        * the cases below read the leftover inputB samples at small offsets from py */
00355 
00356       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00357        ** No loop unrolling is used. */
00358       k = srcBLen % 0x4U;
00359 
00360       if (k == 1U)
00361       {
00362         /* Read the single remaining inputB sample, y[0] */
00363         c0 = *(py+1);
00364 
00365 #ifdef  ARM_MATH_BIG_ENDIAN
00366 
00367         c0 = c0 << 16U;
00368 
00369 #else
00370 
00371         c0 = c0 & 0x0000FFFF;
00372 
00373 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00374         /* Read x[7] */
00375         x3 = *__SIMD32(px);
00376         px++;
00377 
00378         /* Perform the multiply-accumulates */
00379         acc0 = __SMLALD(x0, c0, acc0);
00380         acc1 = __SMLALD(x1, c0, acc1);
00381         acc2 = __SMLALDX(x1, c0, acc2);
00382         acc3 = __SMLALDX(x3, c0, acc3);
00383       }
00384 
00385       if (k == 2U)
00386       {
00387         /* Read the two remaining inputB samples, y[0] and y[1] */
00388         c0 = _SIMD32_OFFSET(py);
00389 
00390         /* Read x[7], x[8] */
00391         x3 = *__SIMD32(px);
00392 
00393         /* Read x[9] */
00394         x2 = _SIMD32_OFFSET(px+1);
00395         px += 2U;
00396 
00397         /* Perform the multiply-accumulates */
00398         acc0 = __SMLALDX(x0, c0, acc0);
00399         acc1 = __SMLALDX(x1, c0, acc1);
00400         acc2 = __SMLALDX(x3, c0, acc2);
00401         acc3 = __SMLALDX(x2, c0, acc3);
00402       }
00403 
00404       if (k == 3U)
00405       {
00406         /* Read y[1] and y[2]; the last remaining sample y[0] is read further below */
00407         c0 = _SIMD32_OFFSET(py);
00408 
00409         /* Read x[7], x[8] */
00410         x3 = *__SIMD32(px);
00411 
00412         /* Read x[9] */
00413         x2 = _SIMD32_OFFSET(px+1);
00414 
00415         /* Perform the multiply-accumulates */
00416         acc0 = __SMLALDX(x0, c0, acc0);
00417         acc1 = __SMLALDX(x1, c0, acc1);
00418         acc2 = __SMLALDX(x3, c0, acc2);
00419         acc3 = __SMLALDX(x2, c0, acc3);
00420 
00421         c0 = *(py-1);
00422 
00423 #ifdef  ARM_MATH_BIG_ENDIAN
00424 
00425         c0 = c0 << 16U;
00426 #else
00427 
00428         c0 = c0 & 0x0000FFFF;
00429 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00430         /* Read x[10] */
00431         x3 =  _SIMD32_OFFSET(px+2);
00432         px += 3U;
00433 
00434         /* Perform the multiply-accumulates */
00435         acc0 = __SMLALDX(x1, c0, acc0);
00436         acc1 = __SMLALD(x2, c0, acc1);
00437         acc2 = __SMLALDX(x2, c0, acc2);
00438         acc3 = __SMLALDX(x3, c0, acc3);
00439       }
00440 
00441 
00442       /* Store the results in the accumulators in the destination buffer. */
00443 
00444 #ifndef  ARM_MATH_BIG_ENDIAN
00445 
00446       *__SIMD32(pOut)++ =
00447         __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
00448       *__SIMD32(pOut)++ =
00449         __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
00450 
00451 #else
00452 
00453       *__SIMD32(pOut)++ =
00454         __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
00455       *__SIMD32(pOut)++ =
00456         __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
00457 
00458 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
00459 
00460       /* Increment the pointer pIn1 index, count by 4 */
00461       count += 4U;
00462 
00463       /* Update the inputA and inputB pointers for next MAC calculation */
00464       px = pIn1 + count;
00465       py = pSrc2;
00466 
00467        /* Decrement the loop counter */
00468       blkCnt--;
00469     }
00470 
00471     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
00472      ** No loop unrolling is used. */
00473     blkCnt = blockSize2 % 0x4U;
00474 
00475     while (blkCnt > 0U)
00476     {
00477       /* Accumulator is made zero for every iteration */
00478       sum = 0;
00479 
00480       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00481       k = srcBLen >> 2U;
00482 
00483       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00484        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00485       while (k > 0U)
00486       {
00487         /* Perform the multiply-accumulates */
00488         sum += (q63_t) ((q31_t) * px++ * *py--);
00489         sum += (q63_t) ((q31_t) * px++ * *py--);
00490         sum += (q63_t) ((q31_t) * px++ * *py--);
00491         sum += (q63_t) ((q31_t) * px++ * *py--);
00492 
00493         /* Decrement the loop counter */
00494         k--;
00495       }
00496 
00497       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00498        ** No loop unrolling is used. */
00499       k = srcBLen % 0x4U;
00500 
00501       while (k > 0U)
00502       {
00503         /* Perform the multiply-accumulates */
00504         sum += (q63_t) ((q31_t) * px++ * *py--);
00505 
00506         /* Decrement the loop counter */
00507         k--;
00508       }
00509 
00510       /* Store the result in the accumulator in the destination buffer. */
00511       *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
00512 
00513       /* Increment the pointer pIn1 index, count by 1 */
00514       count++;
00515 
00516       /* Update the inputA and inputB pointers for next MAC calculation */
00517       px = pIn1 + count;
00518       py = pSrc2;
00519 
00520       /* Decrement the loop counter */
00521       blkCnt--;
00522     }
00523   }
00524   else
00525   {
00526     /* srcBLen is less than 4 here,
00527      * so the blockSize2 loop is not unrolled by 4 */
00528     blkCnt = blockSize2;
00529 
00530     while (blkCnt > 0U)
00531     {
00532       /* Accumulator is made zero for every iteration */
00533       sum = 0;
00534 
00535       /* srcBLen number of MACS should be performed */
00536       k = srcBLen;
00537 
00538       while (k > 0U)
00539       {
00540         /* Perform the multiply-accumulate */
00541         sum += (q63_t) ((q31_t) * px++ * *py--);
00542 
00543         /* Decrement the loop counter */
00544         k--;
00545       }
00546 
00547       /* Store the result in the accumulator in the destination buffer. */
00548       *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
00549 
00550       /* Increment the pointer pIn1 index, count by 1 */
00551       count++;
00552 
00553       /* Update the inputA and inputB pointers for next MAC calculation */
00554       px = pIn1 + count;
00555       py = pSrc2;
00556 
00557       /* Decrement the loop counter */
00558       blkCnt--;
00559     }
00560   }
00561 
00562 
00563   /* --------------------------
00564    * Initializations of stage3
00565    * -------------------------*/
00566 
00567   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
00568    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
00569    * ....
00570    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
00571    * sum +=  x[srcALen-1] * y[srcBLen-1]
00572    */
00573 
00574   /* In this stage the MAC operations are decreased by 1 for every iteration.
00575      The blockSize3 variable holds the number of MAC operations performed */
00576 
00577   blockSize3 = srcBLen - 1U;
00578 
00579   /* Working pointer of inputA */
00580   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
00581   px = pSrc1;
00582 
00583   /* Working pointer of inputB */
00584   pSrc2 = pIn2 + (srcBLen - 1U);
00585   pIn2 = pSrc2 - 1U;
00586   py = pIn2;
00587 
00588   /* -------------------
00589    * Stage3 process
00590    * ------------------*/
00591 
00592   /* For loop unrolling by 4, this stage is divided into two. */
00593   /* First part of this stage computes the first (blockSize3 >> 2) output samples using SIMD MACs */
00594   /* Second part of this stage computes the remaining output samples one MAC at a time */
00595 
00596   /* The first part of the stage starts here */
00597   j = blockSize3 >> 2U;
00598 
00599   while ((j > 0U) && (blockSize3 > 0U))
00600   {
00601     /* Accumulator is made zero for every iteration */
00602     sum = 0;
00603 
00604     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00605     k = blockSize3 >> 2U;
00606 
00607     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00608      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00609     while (k > 0U)
00610     {
00611       /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
00612        * with y[srcBLen - 1], y[srcBLen - 2] respectively */
00613       sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00614       /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
00615        * with y[srcBLen - 3], y[srcBLen - 4] respectively */
00616       sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00617 
00618       /* Decrement the loop counter */
00619       k--;
00620     }
00621 
00622     /* For the next MAC operations, the pointer py is used without SIMD
00623      * So, py is incremented by 1 */
00624     py = py + 1U;
00625 
00626     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
00627      ** No loop unrolling is used. */
00628     k = blockSize3 % 0x4U;
00629 
00630     while (k > 0U)
00631     {
00632       /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
00633       sum = __SMLALD(*px++, *py--, sum);
00634 
00635       /* Decrement the loop counter */
00636       k--;
00637     }
00638 
00639     /* Store the result in the accumulator in the destination buffer. */
00640     *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00641 
00642     /* Update the inputA and inputB pointers for next MAC calculation */
00643     px = ++pSrc1;
00644     py = pIn2;
00645 
00646     /* Decrement the loop counter */
00647     blockSize3--;
00648 
00649     j--;
00650   }
00651 
00652   /* The second part of the stage starts here */
00653   /* SIMD is not used for the next MAC operations,
00654    * so pointer py is updated to read only one sample at a time */
00655   py = py + 1U;
00656 
00657   while (blockSize3 > 0U)
00658   {
00659     /* Accumulator is made zero for every iteration */
00660     sum = 0;
00661 
00662     /* blockSize3 MACs are performed for this output sample; no loop unrolling is used. */
00663     k = blockSize3;
00664 
00665     while (k > 0U)
00666     {
00667       /* Perform the multiply-accumulates */
00668       /* sum +=  x[srcALen-1] * y[srcBLen-1] */
00669       sum = __SMLALD(*px++, *py--, sum);
00670 
00671       /* Decrement the loop counter */
00672       k--;
00673     }
00674 
00675     /* Store the result in the accumulator in the destination buffer. */
00676     *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00677 
00678     /* Update the inputA and inputB pointers for next MAC calculation */
00679     px = ++pSrc1;
00680     py = pSrc2;
00681 
00682     /* Decrement the loop counter */
00683     blockSize3--;
00684   }
00685 
00686 #else
00687 
00688 /* Run the below code for all other configurations (e.g. Cortex-M0, or when UNALIGNED_SUPPORT_DISABLE is defined) */
00689 
00690   q15_t *pIn1 = pSrcA;                           /* input pointer */
00691   q15_t *pIn2 = pSrcB;                           /* coefficient pointer */
00692   q63_t sum;                                     /* Accumulator */
00693   uint32_t i, j;                                 /* loop counter */
00694 
00695   /* Loop to calculate output of convolution for output length number of times */
00696   for (i = 0; i < (srcALen + srcBLen - 1); i++)
00697   {
00698     /* Initialize sum with zero to carry on MAC operations */
00699     sum = 0;
00700 
00701     /* Loop to perform MAC operations according to convolution equation */
00702     for (j = 0; j <= i; j++)
00703     {
00704       /* Skip index pairs that fall outside either input sequence */
00705       if (((i - j) < srcBLen) && (j < srcALen))
00706       {
00707         /* z[i] += x[j] * y[i-j], with x = pSrcA and y = pSrcB */
00708         sum += (q31_t) pIn1[j] * (pIn2[i - j]);
00709       }
00710     }
00711 
00712     /* Store the output in the destination buffer */
00713     pDst[i] = (q15_t) __SSAT((sum >> 15U), 16U);
00714   }
00715 
00716 #endif /* #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */
00717 
00718 }
00719 
00720 /**
00721  * @} end of Conv group
00722  */
00723