Robert Lopez / CMSIS5
Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_fast_q15.c Source File

arm_conv_fast_q15.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_conv_fast_q15.c
00004  * Description:  Fast Q15 Convolution
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupFilters
00033  */
00034 
00035 /**
00036  * @addtogroup Conv
00037  * @{
00038  */
00039 
00040 /**
00041  * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
00042  * @param[in] *pSrcA points to the first input sequence.
00043  * @param[in] srcALen length of the first input sequence.
00044  * @param[in] *pSrcB points to the second input sequence.
00045  * @param[in] srcBLen length of the second input sequence.
00046  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.
00047  * @return none.
00048  *
00049  * <b>Scaling and Overflow Behavior:</b>
00050  *
00051  * \par
00052  * This fast version uses a 32-bit accumulator with 2.30 format.
00053  * The accumulator maintains full precision of the intermediate multiplication results
00054  * but provides only a single guard bit. There is no saturation on intermediate additions.
00055  * Thus, if the accumulator overflows it wraps around and distorts the result.
00056  * The input signals should be scaled down to avoid intermediate overflows.
00057  * Scale down the inputs by log2(min(srcALen, srcBLen)) bits (log2 is the logarithm to base 2) to avoid overflows,
00058  * as a maximum of min(srcALen, srcBLen) additions are carried out internally.
00059  * The 2.30 accumulator is right shifted by 15 bits and then truncated to 1.15 format (plain cast, no explicit saturation in the stores shown here) to yield the final result.
00060  *
00061  * \par
00062  * See <code>arm_conv_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
00063  */
00064 
00065 void arm_conv_fast_q15(
00066   q15_t * pSrcA,
00067   uint32_t srcALen,
00068   q15_t * pSrcB,
00069   uint32_t srcBLen,
00070   q15_t * pDst)
00071 {
00072 #ifndef UNALIGNED_SUPPORT_DISABLE
00073   q15_t *pIn1;                                   /* inputA pointer */
00074   q15_t *pIn2;                                   /* inputB pointer */
00075   q15_t *pOut = pDst;                            /* output pointer */
00076   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */
00077   q15_t *px;                                     /* Intermediate inputA pointer  */
00078   q15_t *py;                                     /* Intermediate inputB pointer  */
00079   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
00080   q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold state and coefficient values */
00081   uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt;     /* loop counter */
00082 
00083   /* The algorithm implementation is based on the lengths of the inputs. */
00084   /* srcB is always made to slide across srcA. */
00085   /* So srcBLen is always considered as shorter or equal to srcALen */
00086   if (srcALen >= srcBLen)
00087   {
00088     /* Initialization of inputA pointer */
00089     pIn1 = pSrcA;
00090 
00091     /* Initialization of inputB pointer */
00092     pIn2 = pSrcB;
00093   }
00094   else
00095   {
00096     /* Initialization of inputA pointer */
00097     pIn1 = pSrcB;
00098 
00099     /* Initialization of inputB pointer */
00100     pIn2 = pSrcA;
00101 
00102     /* srcBLen is always considered as shorter or equal to srcALen */
00103     j = srcBLen;
00104     srcBLen = srcALen;
00105     srcALen = j;
00106   }
00107 
00108   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00109   /* The function is internally
00110    * divided into three stages according to the number of multiplications that have to
00111    * take place between inputA samples and inputB samples. In the first stage of the
00112    * algorithm, the multiplications increase by one for every iteration.
00113    * In the second stage of the algorithm, srcBLen number of multiplications are done.
00114    * In the third stage of the algorithm, the multiplications decrease by one
00115    * for every iteration. */
00116 
00117   /* The algorithm is implemented in three stages.
00118      The loop counters of each stage are initialized here. */
00119   blockSize1 = srcBLen - 1U;
00120   blockSize2 = srcALen - (srcBLen - 1U);
00121   blockSize3 = blockSize1;
00122 
00123   /* --------------------------
00124    * Initializations of stage1
00125    * -------------------------*/
00126 
00127   /* sum = x[0] * y[0]
00128    * sum = x[0] * y[1] + x[1] * y[0]
00129    * ....
00130    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
00131    */
00132 
00133   /* In this stage the MAC operations are increased by 1 for every iteration.
00134      The count variable holds the number of MAC operations performed */
00135   count = 1U;
00136 
00137   /* Working pointer of inputA */
00138   px = pIn1;
00139 
00140   /* Working pointer of inputB */
00141   py = pIn2;
00142 
00143 
00144   /* ------------------------
00145    * Stage1 process
00146    * ----------------------*/
00147 
00148   /* For loop unrolling by 4, this stage is divided into two. */
00149   /* First part of this stage computes the MAC operations less than 4 */
00150   /* Second part of this stage computes the MAC operations greater than or equal to 4 */
00151 
00152   /* The first part of the stage starts here */
00153   while ((count < 4U) && (blockSize1 > 0U))
00154   {
00155     /* Accumulator is made zero for every iteration */
00156     sum = 0;
00157 
00158     /* Loop over number of MAC operations between
00159      * inputA samples and inputB samples */
00160     k = count;
00161 
00162     while (k > 0U)
00163     {
00164       /* Perform the multiply-accumulates */
00165       sum = __SMLAD(*px++, *py--, sum);
00166 
00167       /* Decrement the loop counter */
00168       k--;
00169     }
00170 
00171     /* Store the result in the accumulator in the destination buffer. */
00172     *pOut++ = (q15_t) (sum >> 15);
00173 
00174     /* Update the inputA and inputB pointers for next MAC calculation */
00175     py = pIn2 + count;
00176     px = pIn1;
00177 
00178     /* Increment the MAC count */
00179     count++;
00180 
00181     /* Decrement the loop counter */
00182     blockSize1--;
00183   }
00184 
00185   /* The second part of the stage starts here */
00186   /* The internal loop, over count, is unrolled by 4 */
00187   /* To read the inputB samples y[count-2] and y[count-1] with a single SIMD access,
00188    * py is decremented by 1 */
00189   py = py - 1;
00190 
00191   while (blockSize1 > 0U)
00192   {
00193     /* Accumulator is made zero for every iteration */
00194     sum = 0;
00195 
00196     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00197     k = count >> 2U;
00198 
00199     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00200      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00201     while (k > 0U)
00202     {
00203       /* Perform the multiply-accumulates */
00204       /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
00205       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00206       /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
00207       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00208 
00209       /* Decrement the loop counter */
00210       k--;
00211     }
00212 
00213     /* For the next MAC operations, the pointer py is used without SIMD
00214      * So, py is incremented by 1 */
00215     py = py + 1U;
00216 
00217     /* If the count is not a multiple of 4, compute any remaining MACs here.
00218      ** No loop unrolling is used. */
00219     k = count % 0x4U;
00220 
00221     while (k > 0U)
00222     {
00223       /* Perform the multiply-accumulates */
00224       sum = __SMLAD(*px++, *py--, sum);
00225 
00226       /* Decrement the loop counter */
00227       k--;
00228     }
00229 
00230     /* Store the result in the accumulator in the destination buffer. */
00231     *pOut++ = (q15_t) (sum >> 15);
00232 
00233     /* Update the inputA and inputB pointers for next MAC calculation */
00234     py = pIn2 + (count - 1U);
00235     px = pIn1;
00236 
00237     /* Increment the MAC count */
00238     count++;
00239 
00240     /* Decrement the loop counter */
00241     blockSize1--;
00242   }
00243 
00244   /* --------------------------
00245    * Initializations of stage2
00246    * ------------------------*/
00247 
00248   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
00249    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
00250    * ....
00251    * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
00252    */
00253 
00254   /* Working pointer of inputA */
00255   px = pIn1;
00256 
00257   /* Working pointer of inputB */
00258   pSrc2 = pIn2 + (srcBLen - 1U);
00259   py = pSrc2;
00260 
00261   /* count is the index by which the pointer pIn1 to be incremented */
00262   count = 0U;
00263 
00264 
00265   /* --------------------
00266    * Stage2 process
00267    * -------------------*/
00268 
00269   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
00270    * So, to loop unroll over blockSize2,
00271    * srcBLen should be greater than or equal to 4 */
00272   if (srcBLen >= 4U)
00273   {
00274     /* Loop unroll over blockSize2, by 4 */
00275     blkCnt = blockSize2 >> 2U;
00276 
00277     while (blkCnt > 0U)
00278     {
00279       py = py - 1U;
00280 
00281       /* Set all accumulators to zero */
00282       acc0 = 0;
00283       acc1 = 0;
00284       acc2 = 0;
00285       acc3 = 0;
00286 
00287 
00288       /* read x[0], x[1] samples */
00289       x0 = *__SIMD32(px);
00290       /* read x[1], x[2] samples */
00291       x1 = _SIMD32_OFFSET(px+1);
00292       px+= 2U;
00293 
00294 
00295       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00296       k = srcBLen >> 2U;
00297 
00298       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00299        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00300       do
00301       {
00302         /* Read the last two inputB samples using SIMD:
00303          * y[srcBLen - 1] and y[srcBLen - 2] */
00304         c0 = *__SIMD32(py)--;
00305 
00306         /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
00307         acc0 = __SMLADX(x0, c0, acc0);
00308 
00309         /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
00310         acc1 = __SMLADX(x1, c0, acc1);
00311 
00312         /* Read x[2], x[3] */
00313         x2 = *__SIMD32(px);
00314 
00315         /* Read x[3], x[4] */
00316         x3 = _SIMD32_OFFSET(px+1);
00317 
00318         /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
00319         acc2 = __SMLADX(x2, c0, acc2);
00320 
00321         /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
00322         acc3 = __SMLADX(x3, c0, acc3);
00323 
00324         /* Read y[srcBLen - 3] and y[srcBLen - 4] */
00325         c0 = *__SIMD32(py)--;
00326 
00327         /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
00328         acc0 = __SMLADX(x2, c0, acc0);
00329 
00330         /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
00331         acc1 = __SMLADX(x3, c0, acc1);
00332 
00333         /* Read x[4], x[5] */
00334         x0 = _SIMD32_OFFSET(px+2);
00335 
00336         /* Read x[5], x[6] */
00337         x1 = _SIMD32_OFFSET(px+3);
00338         px += 4U;
00339 
00340         /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
00341         acc2 = __SMLADX(x0, c0, acc2);
00342 
00343         /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
00344         acc3 = __SMLADX(x1, c0, acc3);
00345 
00346       } while (--k);
00347 
00348       /* For the next MAC operations, SIMD is not used
00349        * So, the 16 bit pointer if inputB, py is updated */
00350 
00351       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00352        ** No loop unrolling is used. */
00353       k = srcBLen % 0x4U;
00354 
00355       if (k == 1U)
00356       {
00357         /* Read y[srcBLen - 5] */
00358         c0 = *(py+1);
00359 
00360 #ifdef  ARM_MATH_BIG_ENDIAN
00361 
00362         c0 = c0 << 16U;
00363 
00364 #else
00365 
00366         c0 = c0 & 0x0000FFFF;
00367 
00368 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00369 
00370         /* Read x[7] */
00371         x3 = *__SIMD32(px);
00372         px++;
00373 
00374         /* Perform the multiply-accumulates */
00375         acc0 = __SMLAD(x0, c0, acc0);
00376         acc1 = __SMLAD(x1, c0, acc1);
00377         acc2 = __SMLADX(x1, c0, acc2);
00378         acc3 = __SMLADX(x3, c0, acc3);
00379       }
00380 
00381       if (k == 2U)
00382       {
00383         /* Read y[srcBLen - 5], y[srcBLen - 6] */
00384         c0 = _SIMD32_OFFSET(py);
00385 
00386         /* Read x[7], x[8] */
00387         x3 = *__SIMD32(px);
00388 
00389         /* Read x[9] */
00390         x2 = _SIMD32_OFFSET(px+1);
00391         px += 2U;
00392 
00393         /* Perform the multiply-accumulates */
00394         acc0 = __SMLADX(x0, c0, acc0);
00395         acc1 = __SMLADX(x1, c0, acc1);
00396         acc2 = __SMLADX(x3, c0, acc2);
00397         acc3 = __SMLADX(x2, c0, acc3);
00398       }
00399 
00400       if (k == 3U)
00401       {
00402         /* Read y[srcBLen - 5], y[srcBLen - 6] */
00403         c0 = _SIMD32_OFFSET(py);
00404 
00405         /* Read x[7], x[8] */
00406         x3 = *__SIMD32(px);
00407 
00408         /* Read x[9] */
00409         x2 = _SIMD32_OFFSET(px+1);
00410 
00411         /* Perform the multiply-accumulates */
00412         acc0 = __SMLADX(x0, c0, acc0);
00413         acc1 = __SMLADX(x1, c0, acc1);
00414         acc2 = __SMLADX(x3, c0, acc2);
00415         acc3 = __SMLADX(x2, c0, acc3);
00416 
00417         /* Read y[srcBLen - 7] */
00418         c0 = *(py-1);
00419 #ifdef  ARM_MATH_BIG_ENDIAN
00420 
00421         c0 = c0 << 16U;
00422 #else
00423 
00424         c0 = c0 & 0x0000FFFF;
00425 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00426 
00427         /* Read x[10] */
00428         x3 =  _SIMD32_OFFSET(px+2);
00429         px += 3U;
00430 
00431         /* Perform the multiply-accumulates */
00432         acc0 = __SMLADX(x1, c0, acc0);
00433         acc1 = __SMLAD(x2, c0, acc1);
00434         acc2 = __SMLADX(x2, c0, acc2);
00435         acc3 = __SMLADX(x3, c0, acc3);
00436       }
00437 
00438       /* Store the results in the accumulators in the destination buffer. */
00439 #ifndef ARM_MATH_BIG_ENDIAN
00440 
00441       *__SIMD32(pOut)++ = __PKHBT((acc0 >> 15), (acc1 >> 15), 16);
00442       *__SIMD32(pOut)++ = __PKHBT((acc2 >> 15), (acc3 >> 15), 16);
00443 
00444 #else
00445 
00446       *__SIMD32(pOut)++ = __PKHBT((acc1 >> 15), (acc0 >> 15), 16);
00447       *__SIMD32(pOut)++ = __PKHBT((acc3 >> 15), (acc2 >> 15), 16);
00448 
00449 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
00450 
00451       /* Increment the pointer pIn1 index, count by 4 */
00452       count += 4U;
00453 
00454       /* Update the inputA and inputB pointers for next MAC calculation */
00455       px = pIn1 + count;
00456       py = pSrc2;
00457 
00458       /* Decrement the loop counter */
00459       blkCnt--;
00460     }
00461 
00462     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
00463      ** No loop unrolling is used. */
00464     blkCnt = blockSize2 % 0x4U;
00465 
00466     while (blkCnt > 0U)
00467     {
00468       /* Accumulator is made zero for every iteration */
00469       sum = 0;
00470 
00471       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00472       k = srcBLen >> 2U;
00473 
00474       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00475        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00476       while (k > 0U)
00477       {
00478         /* Perform the multiply-accumulates */
00479         sum += ((q31_t) * px++ * *py--);
00480         sum += ((q31_t) * px++ * *py--);
00481         sum += ((q31_t) * px++ * *py--);
00482         sum += ((q31_t) * px++ * *py--);
00483 
00484         /* Decrement the loop counter */
00485         k--;
00486       }
00487 
00488       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00489        ** No loop unrolling is used. */
00490       k = srcBLen % 0x4U;
00491 
00492       while (k > 0U)
00493       {
00494         /* Perform the multiply-accumulates */
00495         sum += ((q31_t) * px++ * *py--);
00496 
00497         /* Decrement the loop counter */
00498         k--;
00499       }
00500 
00501       /* Store the result in the accumulator in the destination buffer. */
00502       *pOut++ = (q15_t) (sum >> 15);
00503 
00504       /* Increment the pointer pIn1 index, count by 1 */
00505       count++;
00506 
00507       /* Update the inputA and inputB pointers for next MAC calculation */
00508       px = pIn1 + count;
00509       py = pSrc2;
00510 
00511       /* Decrement the loop counter */
00512       blkCnt--;
00513     }
00514   }
00515   else
00516   {
00517     /* If srcBLen is less than 4,
00518      * the blockSize2 loop is not unrolled by 4 */
00519     blkCnt = blockSize2;
00520 
00521     while (blkCnt > 0U)
00522     {
00523       /* Accumulator is made zero for every iteration */
00524       sum = 0;
00525 
00526       /* srcBLen number of MACS should be performed */
00527       k = srcBLen;
00528 
00529       while (k > 0U)
00530       {
00531         /* Perform the multiply-accumulate */
00532         sum += ((q31_t) * px++ * *py--);
00533 
00534         /* Decrement the loop counter */
00535         k--;
00536       }
00537 
00538       /* Store the result in the accumulator in the destination buffer. */
00539       *pOut++ = (q15_t) (sum >> 15);
00540 
00541       /* Increment the MAC count */
00542       count++;
00543 
00544       /* Update the inputA and inputB pointers for next MAC calculation */
00545       px = pIn1 + count;
00546       py = pSrc2;
00547 
00548       /* Decrement the loop counter */
00549       blkCnt--;
00550     }
00551   }
00552 
00553 
00554   /* --------------------------
00555    * Initializations of stage3
00556    * -------------------------*/
00557 
00558   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
00559    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
00560    * ....
00561    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
00562    * sum +=  x[srcALen-1] * y[srcBLen-1]
00563    */
00564 
00565   /* In this stage the MAC operations are decreased by 1 for every iteration.
00566      The blockSize3 variable holds the number of MAC operations performed */
00567 
00568   /* Working pointer of inputA */
00569   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
00570   px = pSrc1;
00571 
00572   /* Working pointer of inputB */
00573   pSrc2 = pIn2 + (srcBLen - 1U);
00574   pIn2 = pSrc2 - 1U;
00575   py = pIn2;
00576 
00577   /* -------------------
00578    * Stage3 process
00579    * ------------------*/
00580 
00581   /* For loop unrolling by 4, this stage is divided into two. */
00582   /* First part of this stage computes the MAC operations greater than 4 */
00583   /* Second part of this stage computes the MAC operations less than or equal to 4 */
00584 
00585   /* The first part of the stage starts here */
00586   j = blockSize3 >> 2U;
00587 
00588   while ((j > 0U) && (blockSize3 > 0U))
00589   {
00590     /* Accumulator is made zero for every iteration */
00591     sum = 0;
00592 
00593     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00594     k = blockSize3 >> 2U;
00595 
00596     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00597      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00598     while (k > 0U)
00599     {
00600       /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
00601        * with y[srcBLen - 1], y[srcBLen - 2] respectively */
00602       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00603       /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
00604        * with y[srcBLen - 3], y[srcBLen - 4] respectively */
00605       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00606 
00607       /* Decrement the loop counter */
00608       k--;
00609     }
00610 
00611     /* For the next MAC operations, the pointer py is used without SIMD
00612      * So, py is incremented by 1 */
00613     py = py + 1U;
00614 
00615     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
00616      ** No loop unrolling is used. */
00617     k = blockSize3 % 0x4U;
00618 
00619     while (k > 0U)
00620     {
00621       /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
00622       sum = __SMLAD(*px++, *py--, sum);
00623 
00624       /* Decrement the loop counter */
00625       k--;
00626     }
00627 
00628     /* Store the result in the accumulator in the destination buffer. */
00629     *pOut++ = (q15_t) (sum >> 15);
00630 
00631     /* Update the inputA and inputB pointers for next MAC calculation */
00632     px = ++pSrc1;
00633     py = pIn2;
00634 
00635     /* Decrement the loop counter */
00636     blockSize3--;
00637 
00638     j--;
00639   }
00640 
00641   /* The second part of the stage starts here */
00642   /* SIMD is not used for the next MAC operations,
00643    * so pointer py is updated to read only one sample at a time */
00644   py = py + 1U;
00645 
00646   while (blockSize3 > 0U)
00647   {
00648     /* Accumulator is made zero for every iteration */
00649     sum = 0;
00650 
00651     /* Loop over the blockSize3 remaining MAC operations; no loop unrolling is used. */
00652     k = blockSize3;
00653 
00654     while (k > 0U)
00655     {
00656       /* Perform the multiply-accumulates */
00657       /* sum +=  x[srcALen-1] * y[srcBLen-1] */
00658       sum = __SMLAD(*px++, *py--, sum);
00659 
00660       /* Decrement the loop counter */
00661       k--;
00662     }
00663 
00664     /* Store the result in the accumulator in the destination buffer. */
00665     *pOut++ = (q15_t) (sum >> 15);
00666 
00667     /* Update the inputA and inputB pointers for next MAC calculation */
00668     px = ++pSrc1;
00669     py = pSrc2;
00670 
00671     /* Decrement the loop counter */
00672     blockSize3--;
00673   }
00674 
00675 #else
00676   q15_t *pIn1;                                   /* inputA pointer */
00677   q15_t *pIn2;                                   /* inputB pointer */
00678   q15_t *pOut = pDst;                            /* output pointer */
00679   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */
00680   q15_t *px;                                     /* Intermediate inputA pointer  */
00681   q15_t *py;                                     /* Intermediate inputB pointer  */
00682   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
00683   q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold state and coefficient values */
00684   uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt;     /* loop counter */
00685   q15_t a, b;
00686 
00687   /* The algorithm implementation is based on the lengths of the inputs. */
00688   /* srcB is always made to slide across srcA. */
00689   /* So srcBLen is always considered as shorter or equal to srcALen */
00690   if (srcALen >= srcBLen)
00691   {
00692     /* Initialization of inputA pointer */
00693     pIn1 = pSrcA;
00694 
00695     /* Initialization of inputB pointer */
00696     pIn2 = pSrcB;
00697   }
00698   else
00699   {
00700     /* Initialization of inputA pointer */
00701     pIn1 = pSrcB;
00702 
00703     /* Initialization of inputB pointer */
00704     pIn2 = pSrcA;
00705 
00706     /* srcBLen is always considered as shorter or equal to srcALen */
00707     j = srcBLen;
00708     srcBLen = srcALen;
00709     srcALen = j;
00710   }
00711 
00712   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00713   /* The function is internally
00714    * divided into three stages according to the number of multiplications that have to
00715    * take place between inputA samples and inputB samples. In the first stage of the
00716    * algorithm, the multiplications increase by one for every iteration.
00717    * In the second stage of the algorithm, srcBLen number of multiplications are done.
00718    * In the third stage of the algorithm, the multiplications decrease by one
00719    * for every iteration. */
00720 
00721   /* The algorithm is implemented in three stages.
00722      The loop counters of each stage are initialized here. */
00723   blockSize1 = srcBLen - 1U;
00724   blockSize2 = srcALen - (srcBLen - 1U);
00725   blockSize3 = blockSize1;
00726 
00727   /* --------------------------
00728    * Initializations of stage1
00729    * -------------------------*/
00730 
00731   /* sum = x[0] * y[0]
00732    * sum = x[0] * y[1] + x[1] * y[0]
00733    * ....
00734    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
00735    */
00736 
00737   /* In this stage the MAC operations are increased by 1 for every iteration.
00738      The count variable holds the number of MAC operations performed */
00739   count = 1U;
00740 
00741   /* Working pointer of inputA */
00742   px = pIn1;
00743 
00744   /* Working pointer of inputB */
00745   py = pIn2;
00746 
00747 
00748   /* ------------------------
00749    * Stage1 process
00750    * ----------------------*/
00751 
00752   /* For loop unrolling by 4, this stage is divided into two. */
00753   /* First part of this stage computes the MAC operations less than 4 */
00754   /* Second part of this stage computes the MAC operations greater than or equal to 4 */
00755 
00756   /* The first part of the stage starts here */
00757   while ((count < 4U) && (blockSize1 > 0U))
00758   {
00759     /* Accumulator is made zero for every iteration */
00760     sum = 0;
00761 
00762     /* Loop over number of MAC operations between
00763      * inputA samples and inputB samples */
00764     k = count;
00765 
00766     while (k > 0U)
00767     {
00768       /* Perform the multiply-accumulates */
00769       sum += ((q31_t) * px++ * *py--);
00770 
00771       /* Decrement the loop counter */
00772       k--;
00773     }
00774 
00775     /* Store the result in the accumulator in the destination buffer. */
00776     *pOut++ = (q15_t) (sum >> 15);
00777 
00778     /* Update the inputA and inputB pointers for next MAC calculation */
00779     py = pIn2 + count;
00780     px = pIn1;
00781 
00782     /* Increment the MAC count */
00783     count++;
00784 
00785     /* Decrement the loop counter */
00786     blockSize1--;
00787   }
00788 
00789   /* The second part of the stage starts here */
00790   /* The internal loop, over count, is unrolled by 4 */
00791   /* To, read the last two inputB samples using SIMD:
00792    * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
00793   py = py - 1;
00794 
00795   while (blockSize1 > 0U)
00796   {
00797     /* Accumulator is made zero for every iteration */
00798     sum = 0;
00799 
00800     /* Apply loop unrolling and compute 4 MACs simultaneously. */
00801     k = count >> 2U;
00802 
00803     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00804      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00805     py++;
00806 
00807     while (k > 0U)
00808     {
00809       /* Perform the multiply-accumulates */
00810         sum += ((q31_t) * px++ * *py--);
00811         sum += ((q31_t) * px++ * *py--);
00812         sum += ((q31_t) * px++ * *py--);
00813         sum += ((q31_t) * px++ * *py--);
00814 
00815       /* Decrement the loop counter */
00816       k--;
00817     }
00818 
00819     /* If the count is not a multiple of 4, compute any remaining MACs here.
00820      ** No loop unrolling is used. */
00821     k = count % 0x4U;
00822 
00823     while (k > 0U)
00824     {
00825       /* Perform the multiply-accumulates */
00826       sum += ((q31_t) * px++ * *py--);
00827 
00828       /* Decrement the loop counter */
00829       k--;
00830     }
00831 
00832     /* Store the result in the accumulator in the destination buffer. */
00833     *pOut++ = (q15_t) (sum >> 15);
00834 
00835     /* Update the inputA and inputB pointers for next MAC calculation */
00836     py = pIn2 + (count - 1U);
00837     px = pIn1;
00838 
00839     /* Increment the MAC count */
00840     count++;
00841 
00842     /* Decrement the loop counter */
00843     blockSize1--;
00844   }
00845 
00846   /* --------------------------
00847    * Initializations of stage2
00848    * ------------------------*/
00849 
00850   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
00851    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
00852    * ....
00853    * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
00854    */
00855 
00856   /* Working pointer of inputA */
00857   px = pIn1;
00858 
00859   /* Working pointer of inputB */
00860   pSrc2 = pIn2 + (srcBLen - 1U);
00861   py = pSrc2;
00862 
00863   /* count is the index by which the pointer pIn1 to be incremented */
00864   count = 0U;
00865 
00866 
00867   /* --------------------
00868    * Stage2 process
00869    * -------------------*/
00870 
00871   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
00872    * So, to loop unroll over blockSize2,
00873    * srcBLen should be greater than or equal to 4 */
00874   if (srcBLen >= 4U)
00875   {
00876     /* Loop unroll over blockSize2, by 4 */
00877     blkCnt = blockSize2 >> 2U;
00878 
00879     while (blkCnt > 0U)
00880     {
00881       py = py - 1U;
00882 
00883       /* Set all accumulators to zero */
00884       acc0 = 0;
00885       acc1 = 0;
00886       acc2 = 0;
00887       acc3 = 0;
00888 
00889       /* read x[0], x[1] samples */
00890       a = *px++;
00891       b = *px++;
00892 
00893 #ifndef ARM_MATH_BIG_ENDIAN
00894 
00895       x0 = __PKHBT(a, b, 16);
00896       a = *px;
00897       x1 = __PKHBT(b, a, 16);
00898 
00899 #else
00900 
00901       x0 = __PKHBT(b, a, 16);
00902       a = *px;
00903       x1 = __PKHBT(a, b, 16);
00904 
00905 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
00906 
00907       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00908       k = srcBLen >> 2U;
00909 
00910       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00911        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00912       do
00913       {
00914         /* Read the last two inputB samples using SIMD:
00915          * y[srcBLen - 1] and y[srcBLen - 2] */
00916         a = *py;
00917         b = *(py+1);
00918         py -= 2;
00919 
00920 #ifndef ARM_MATH_BIG_ENDIAN
00921 
00922         c0 = __PKHBT(a, b, 16);
00923 
00924 #else
00925 
00926         c0 = __PKHBT(b, a, 16);;
00927 
00928 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
00929 
00930         /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
00931         acc0 = __SMLADX(x0, c0, acc0);
00932 
00933         /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
00934         acc1 = __SMLADX(x1, c0, acc1);
00935 
00936       a = *px;
00937       b = *(px + 1);
00938 
00939 #ifndef ARM_MATH_BIG_ENDIAN
00940 
00941       x2 = __PKHBT(a, b, 16);
00942       a = *(px + 2);
00943       x3 = __PKHBT(b, a, 16);
00944 
00945 #else
00946 
00947       x2 = __PKHBT(b, a, 16);
00948       a = *(px + 2);
00949       x3 = __PKHBT(a, b, 16);
00950 
00951 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
00952 
00953         /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
00954         acc2 = __SMLADX(x2, c0, acc2);
00955 
00956         /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
00957         acc3 = __SMLADX(x3, c0, acc3);
00958 
00959         /* Read y[srcBLen - 3] and y[srcBLen - 4] */
00960         a = *py;
00961         b = *(py+1);
00962         py -= 2;
00963 
00964 #ifndef ARM_MATH_BIG_ENDIAN
00965 
00966         c0 = __PKHBT(a, b, 16);
00967 
00968 #else
00969 
00970         c0 = __PKHBT(b, a, 16);;
00971 
00972 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
00973 
00974         /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
00975         acc0 = __SMLADX(x2, c0, acc0);
00976 
00977         /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
00978         acc1 = __SMLADX(x3, c0, acc1);
00979 
00980         /* Read x[4], x[5], x[6] */
00981       a = *(px + 2);
00982       b = *(px + 3);
00983 
00984 #ifndef ARM_MATH_BIG_ENDIAN
00985 
00986       x0 = __PKHBT(a, b, 16);
00987       a = *(px + 4);
00988       x1 = __PKHBT(b, a, 16);
00989 
00990 #else
00991 
00992       x0 = __PKHBT(b, a, 16);
00993       a = *(px + 4);
00994       x1 = __PKHBT(a, b, 16);
00995 
00996 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
00997 
00998         px += 4U;
00999 
01000         /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
01001         acc2 = __SMLADX(x0, c0, acc2);
01002 
01003         /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
01004         acc3 = __SMLADX(x1, c0, acc3);
01005 
01006       } while (--k);
01007 
01008       /* For the next MAC operations, SIMD is not used
01009        * So, the 16 bit pointer of inputB, py is updated */
01010 
01011       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
01012        ** No loop unrolling is used. */
01013       k = srcBLen % 0x4U;
01014 
01015       if (k == 1U)
01016       {
01017         /* Read y[srcBLen - 5] */
01018         c0 = *(py+1);
01019 
01020 #ifdef  ARM_MATH_BIG_ENDIAN
01021 
01022         c0 = c0 << 16U;
01023 
01024 #else
01025 
01026         c0 = c0 & 0x0000FFFF;
01027 
01028 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
01029 
01030         /* Read x[7] */
01031         a = *px;
01032         b = *(px+1);
01033         px++;
01034 
01035 #ifndef ARM_MATH_BIG_ENDIAN
01036 
01037         x3 = __PKHBT(a, b, 16);
01038 
01039 #else
01040 
01041         x3 = __PKHBT(b, a, 16);;
01042 
01043 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01044 
01045 
01046         /* Perform the multiply-accumulates */
01047         acc0 = __SMLAD(x0, c0, acc0);
01048         acc1 = __SMLAD(x1, c0, acc1);
01049         acc2 = __SMLADX(x1, c0, acc2);
01050         acc3 = __SMLADX(x3, c0, acc3);
01051       }
01052 
01053       if (k == 2U)
01054       {
01055         /* Read y[srcBLen - 5], y[srcBLen - 6] */
01056         a = *py;
01057         b = *(py+1);
01058 
01059 #ifndef ARM_MATH_BIG_ENDIAN
01060 
01061         c0 = __PKHBT(a, b, 16);
01062 
01063 #else
01064 
01065         c0 = __PKHBT(b, a, 16);;
01066 
01067 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01068 
01069         /* Read x[7], x[8], x[9] */
01070       a = *px;
01071       b = *(px + 1);
01072 
01073 #ifndef ARM_MATH_BIG_ENDIAN
01074 
01075       x3 = __PKHBT(a, b, 16);
01076       a = *(px + 2);
01077       x2 = __PKHBT(b, a, 16);
01078 
01079 #else
01080 
01081       x3 = __PKHBT(b, a, 16);
01082       a = *(px + 2);
01083       x2 = __PKHBT(a, b, 16);
01084 
01085 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
01086         px += 2U;
01087 
01088         /* Perform the multiply-accumulates */
01089         acc0 = __SMLADX(x0, c0, acc0);
01090         acc1 = __SMLADX(x1, c0, acc1);
01091         acc2 = __SMLADX(x3, c0, acc2);
01092         acc3 = __SMLADX(x2, c0, acc3);
01093       }
01094 
01095       if (k == 3U)
01096       {
01097         /* Read y[srcBLen - 5], y[srcBLen - 6] */
01098         a = *py;
01099         b = *(py+1);
01100 
01101 #ifndef ARM_MATH_BIG_ENDIAN
01102 
01103         c0 = __PKHBT(a, b, 16);
01104 
01105 #else
01106 
01107         c0 = __PKHBT(b, a, 16);;
01108 
01109 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01110 
01111         /* Read x[7], x[8], x[9] */
01112       a = *px;
01113       b = *(px + 1);
01114 
01115 #ifndef ARM_MATH_BIG_ENDIAN
01116 
01117       x3 = __PKHBT(a, b, 16);
01118       a = *(px + 2);
01119       x2 = __PKHBT(b, a, 16);
01120 
01121 #else
01122 
01123       x3 = __PKHBT(b, a, 16);
01124       a = *(px + 2);
01125       x2 = __PKHBT(a, b, 16);
01126 
01127 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
01128 
01129         /* Perform the multiply-accumulates */
01130         acc0 = __SMLADX(x0, c0, acc0);
01131         acc1 = __SMLADX(x1, c0, acc1);
01132         acc2 = __SMLADX(x3, c0, acc2);
01133         acc3 = __SMLADX(x2, c0, acc3);
01134 
01135         /* Read y[srcBLen - 7] */
01136         c0 = *(py-1);
01137 #ifdef  ARM_MATH_BIG_ENDIAN
01138 
01139         c0 = c0 << 16U;
01140 #else
01141 
01142         c0 = c0 & 0x0000FFFF;
01143 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
01144 
01145         /* Read x[10] */
01146         a = *(px+2);
01147         b = *(px+3);
01148 
01149 #ifndef ARM_MATH_BIG_ENDIAN
01150 
01151         x3 = __PKHBT(a, b, 16);
01152 
01153 #else
01154 
01155         x3 = __PKHBT(b, a, 16);;
01156 
01157 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01158 
01159         px += 3U;
01160 
01161         /* Perform the multiply-accumulates */
01162         acc0 = __SMLADX(x1, c0, acc0);
01163         acc1 = __SMLAD(x2, c0, acc1);
01164         acc2 = __SMLADX(x2, c0, acc2);
01165         acc3 = __SMLADX(x3, c0, acc3);
01166       }
01167 
01168       /* Store the results in the accumulators in the destination buffer. */
01169       *pOut++ = (q15_t)(acc0 >> 15);
01170       *pOut++ = (q15_t)(acc1 >> 15);
01171       *pOut++ = (q15_t)(acc2 >> 15);
01172       *pOut++ = (q15_t)(acc3 >> 15);
01173 
01174       /* Increment the pointer pIn1 index, count by 4 */
01175       count += 4U;
01176 
01177       /* Update the inputA and inputB pointers for next MAC calculation */
01178       px = pIn1 + count;
01179       py = pSrc2;
01180 
01181       /* Decrement the loop counter */
01182       blkCnt--;
01183     }
01184 
01185     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
01186      ** No loop unrolling is used. */
01187     blkCnt = blockSize2 % 0x4U;
01188 
01189     while (blkCnt > 0U)
01190     {
01191       /* Accumulator is made zero for every iteration */
01192       sum = 0;
01193 
01194       /* Apply loop unrolling and compute 4 MACs simultaneously. */
01195       k = srcBLen >> 2U;
01196 
01197       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
01198        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
01199       while (k > 0U)
01200       {
01201         /* Perform the multiply-accumulates */
01202         sum += ((q31_t) * px++ * *py--);
01203         sum += ((q31_t) * px++ * *py--);
01204         sum += ((q31_t) * px++ * *py--);
01205         sum += ((q31_t) * px++ * *py--);
01206 
01207         /* Decrement the loop counter */
01208         k--;
01209       }
01210 
01211       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
01212        ** No loop unrolling is used. */
01213       k = srcBLen % 0x4U;
01214 
01215       while (k > 0U)
01216       {
01217         /* Perform the multiply-accumulates */
01218         sum += ((q31_t) * px++ * *py--);
01219 
01220         /* Decrement the loop counter */
01221         k--;
01222       }
01223 
01224       /* Store the result in the accumulator in the destination buffer. */
01225       *pOut++ = (q15_t) (sum >> 15);
01226 
01227       /* Increment the pointer pIn1 index, count by 1 */
01228       count++;
01229 
01230       /* Update the inputA and inputB pointers for next MAC calculation */
01231       px = pIn1 + count;
01232       py = pSrc2;
01233 
01234       /* Decrement the loop counter */
01235       blkCnt--;
01236     }
01237   }
01238   else
01239   {
01240     /* If the srcBLen is less than 4,
01241      * the blockSize2 loop cannot be unrolled by 4 */
01242     blkCnt = blockSize2;
01243 
01244     while (blkCnt > 0U)
01245     {
01246       /* Accumulator is made zero for every iteration */
01247       sum = 0;
01248 
01249       /* srcBLen number of MACS should be performed */
01250       k = srcBLen;
01251 
01252       while (k > 0U)
01253       {
01254         /* Perform the multiply-accumulate */
01255         sum += ((q31_t) * px++ * *py--);
01256 
01257         /* Decrement the loop counter */
01258         k--;
01259       }
01260 
01261       /* Store the result in the accumulator in the destination buffer. */
01262       *pOut++ = (q15_t) (sum >> 15);
01263 
01264       /* Increment the MAC count */
01265       count++;
01266 
01267       /* Update the inputA and inputB pointers for next MAC calculation */
01268       px = pIn1 + count;
01269       py = pSrc2;
01270 
01271       /* Decrement the loop counter */
01272       blkCnt--;
01273     }
01274   }
01275 
01276 
01277   /* --------------------------
01278    * Initializations of stage3
01279    * -------------------------*/
01280 
01281   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
01282    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
01283    * ....
01284    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
01285    * sum +=  x[srcALen-1] * y[srcBLen-1]
01286    */
01287 
01288   /* In this stage the MAC operations are decreased by 1 for every iteration.
01289      The blockSize3 variable holds the number of MAC operations performed */
01290 
01291   /* Working pointer of inputA */
01292   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
01293   px = pSrc1;
01294 
01295   /* Working pointer of inputB */
01296   pSrc2 = pIn2 + (srcBLen - 1U);
01297   pIn2 = pSrc2 - 1U;
01298   py = pIn2;
01299 
01300   /* -------------------
01301    * Stage3 process
01302    * ------------------*/
01303 
01304   /* For loop unrolling by 4, this stage is divided into two. */
01305   /* First part of this stage computes the MAC operations greater than 4 */
01306   /* Second part of this stage computes the MAC operations less than or equal to 4 */
01307 
01308   /* The first part of the stage starts here */
01309   j = blockSize3 >> 2U;
01310 
01311   while ((j > 0U) && (blockSize3 > 0U))
01312   {
01313     /* Accumulator is made zero for every iteration */
01314     sum = 0;
01315 
01316     /* Apply loop unrolling and compute 4 MACs simultaneously. */
01317     k = blockSize3 >> 2U;
01318 
01319     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
01320      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
01321     py++;
01322 
01323     while (k > 0U)
01324     {
01325         sum += ((q31_t) * px++ * *py--);
01326         sum += ((q31_t) * px++ * *py--);
01327         sum += ((q31_t) * px++ * *py--);
01328         sum += ((q31_t) * px++ * *py--);
01329       /* Decrement the loop counter */
01330       k--;
01331     }
01332 
01333     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
01334      ** No loop unrolling is used. */
01335     k = blockSize3 % 0x4U;
01336 
01337     while (k > 0U)
01338     {
01339       /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
01340         sum += ((q31_t) * px++ * *py--);
01341 
01342       /* Decrement the loop counter */
01343       k--;
01344     }
01345 
01346     /* Store the result in the accumulator in the destination buffer. */
01347     *pOut++ = (q15_t) (sum >> 15);
01348 
01349     /* Update the inputA and inputB pointers for next MAC calculation */
01350     px = ++pSrc1;
01351     py = pIn2;
01352 
01353     /* Decrement the loop counter */
01354     blockSize3--;
01355 
01356     j--;
01357   }
01358 
01359   /* The second part of the stage starts here */
01360   /* SIMD is not used for the next MAC operations,
01361    * so pointer py is updated to read only one sample at a time */
01362   py = py + 1U;
01363 
01364   while (blockSize3 > 0U)
01365   {
01366     /* Accumulator is made zero for every iteration */
01367     sum = 0;
01368 
01369     /* No loop unrolling here: the blockSize3 MACs are computed one at a time. */
01370     k = blockSize3;
01371 
01372     while (k > 0U)
01373     {
01374       /* Perform the multiply-accumulates */
01375       /* sum +=  x[srcALen-1] * y[srcBLen-1] */
01376         sum += ((q31_t) * px++ * *py--);
01377 
01378       /* Decrement the loop counter */
01379       k--;
01380     }
01381 
01382     /* Store the result in the accumulator in the destination buffer. */
01383     *pOut++ = (q15_t) (sum >> 15);
01384 
01385     /* Update the inputA and inputB pointers for next MAC calculation */
01386     px = ++pSrc1;
01387     py = pSrc2;
01388 
01389     /* Decrement the loop counter */
01390     blockSize3--;
01391   }
01392 
01393 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
01394 }
01395 
01396 /**
01397  * @} end of Conv group
01398  */
01399