Robert Lopez / CMSIS5
Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_fast_q15.c Source File

arm_conv_partial_fast_q15.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_conv_partial_fast_q15.c
00004  * Description:  Fast Q15 Partial convolution
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupFilters
00033  */
00034 
00035 /**
00036  * @addtogroup PartialConv
00037  * @{
00038  */
00039 
00040 /**
00041  * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
00042  * @param[in]       *pSrcA points to the first input sequence.
00043  * @param[in]       srcALen length of the first input sequence.
00044  * @param[in]       *pSrcB points to the second input sequence.
00045  * @param[in]       srcBLen length of the second input sequence.
00046  * @param[out]      *pDst points to the location where the output result is written.
00047  * @param[in]       firstIndex is the index of the first output sample to be computed.
00048  * @param[in]       numPoints is the number of output points to be computed.
00049  * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0, srcALen+srcBLen-2].
00050  *
00051  * See <code>arm_conv_partial_q15()</code> for a slower implementation of this function, which uses a 64-bit accumulator to avoid wrap-around distortion.
00052  */
00053 
00054 
00055 arm_status arm_conv_partial_fast_q15(
00056   q15_t * pSrcA,
00057   uint32_t srcALen,
00058   q15_t * pSrcB,
00059   uint32_t srcBLen,
00060   q15_t * pDst,
00061   uint32_t firstIndex,
00062   uint32_t numPoints)
00063 {
00064 #ifndef UNALIGNED_SUPPORT_DISABLE
00065 
00066   q15_t *pIn1;                                   /* inputA pointer               */
00067   q15_t *pIn2;                                   /* inputB pointer               */
00068   q15_t *pOut = pDst;                            /* output pointer               */
00069   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator                  */
00070   q15_t *px;                                     /* Intermediate inputA pointer  */
00071   q15_t *py;                                     /* Intermediate inputB pointer  */
00072   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */
00073   q31_t x0, x1, x2, x3, c0;
00074   uint32_t j, k, count, check, blkCnt;
00075   int32_t blockSize1, blockSize2, blockSize3;    /* loop counters                 */
00076   arm_status status;                             /* status of Partial convolution */
00077 
00078   /* Check for range of output samples to be calculated */
00079   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
00080   {
00081     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00082     status = ARM_MATH_ARGUMENT_ERROR;
00083   }
00084   else
00085   {
00086 
00087     /* The algorithm implementation is based on the lengths of the inputs. */
00088     /* srcB is always made to slide across srcA. */
00089     /* So srcBLen is always considered as shorter or equal to srcALen */
00090     if (srcALen >=srcBLen)
00091     {
00092       /* Initialization of inputA pointer */
00093       pIn1 = pSrcA;
00094 
00095       /* Initialization of inputB pointer */
00096       pIn2 = pSrcB;
00097     }
00098     else
00099     {
00100       /* Initialization of inputA pointer */
00101       pIn1 = pSrcB;
00102 
00103       /* Initialization of inputB pointer */
00104       pIn2 = pSrcA;
00105 
00106       /* srcBLen is always considered as shorter or equal to srcALen */
00107       j = srcBLen;
00108       srcBLen = srcALen;
00109       srcALen = j;
00110     }
00111 
00112     /* Conditions to check which loopCounter holds
00113      * the first and last indices of the output samples to be calculated. */
00114     check = firstIndex + numPoints;
00115     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
00116     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
00117     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
00118     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :
00119                                      (int32_t) numPoints) : 0;
00120     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
00121                                     (int32_t) firstIndex);
00122     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
00123 
00124     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00125     /* The function is internally
00126      * divided into three stages according to the number of multiplications that has to be
00127      * taken place between inputA samples and inputB samples. In the first stage of the
00128      * algorithm, the multiplications increase by one for every iteration.
00129      * In the second stage of the algorithm, srcBLen number of multiplications are done.
00130      * In the third stage of the algorithm, the multiplications decrease by one
00131      * for every iteration. */
00132 
00133     /* Set the output pointer to point to the firstIndex
00134      * of the output sample to be calculated. */
00135     pOut = pDst + firstIndex;
00136 
00137     /* --------------------------
00138      * Initializations of stage1
00139      * -------------------------*/
00140 
00141     /* sum = x[0] * y[0]
00142      * sum = x[0] * y[1] + x[1] * y[0]
00143      * ....
00144      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
00145      */
00146 
00147     /* In this stage the MAC operations are increased by 1 for every iteration.
00148        The count variable holds the number of MAC operations performed.
00149        Since the partial convolution starts from firstIndex
00150        Number of Macs to be performed is firstIndex + 1 */
00151     count = 1U + firstIndex;
00152 
00153     /* Working pointer of inputA */
00154     px = pIn1;
00155 
00156     /* Working pointer of inputB */
00157     pSrc2 = pIn2 + firstIndex;
00158     py = pSrc2;
00159 
00160     /* ------------------------
00161      * Stage1 process
00162      * ----------------------*/
00163 
00164     /* For loop unrolling by 4, this stage is divided into two. */
00165     /* First part of this stage computes the MAC operations less than 4 */
00166     /* Second part of this stage computes the MAC operations greater than or equal to 4 */
00167 
00168     /* The first part of the stage starts here */
00169     while ((count < 4U) && (blockSize1 > 0))
00170     {
00171       /* Accumulator is made zero for every iteration */
00172       sum = 0;
00173 
00174       /* Loop over number of MAC operations between
00175        * inputA samples and inputB samples */
00176       k = count;
00177 
00178       while (k > 0U)
00179       {
00180         /* Perform the multiply-accumulates */
00181         sum = __SMLAD(*px++, *py--, sum);
00182 
00183         /* Decrement the loop counter */
00184         k--;
00185       }
00186 
00187       /* Store the result in the accumulator in the destination buffer. */
00188       *pOut++ = (q15_t) (sum >> 15);
00189 
00190       /* Update the inputA and inputB pointers for next MAC calculation */
00191       py = ++pSrc2;
00192       px = pIn1;
00193 
00194       /* Increment the MAC count */
00195       count++;
00196 
00197       /* Decrement the loop counter */
00198       blockSize1--;
00199     }
00200 
00201     /* The second part of the stage starts here */
00202     /* The internal loop, over count, is unrolled by 4 */
00203     /* To, read the last two inputB samples using SIMD:
00204      * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
00205     py = py - 1;
00206 
00207     while (blockSize1 > 0)
00208     {
00209       /* Accumulator is made zero for every iteration */
00210       sum = 0;
00211 
00212       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00213       k = count >> 2U;
00214 
00215       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00216        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00217       while (k > 0U)
00218       {
00219         /* Perform the multiply-accumulates */
00220         /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
00221         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00222         /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
00223         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00224 
00225         /* Decrement the loop counter */
00226         k--;
00227       }
00228 
00229       /* For the next MAC operations, the pointer py is used without SIMD
00230        * So, py is incremented by 1 */
00231       py = py + 1U;
00232 
00233       /* If the count is not a multiple of 4, compute any remaining MACs here.
00234        ** No loop unrolling is used. */
00235       k = count % 0x4U;
00236 
00237       while (k > 0U)
00238       {
00239         /* Perform the multiply-accumulates */
00240         sum = __SMLAD(*px++, *py--, sum);
00241 
00242         /* Decrement the loop counter */
00243         k--;
00244       }
00245 
00246       /* Store the result in the accumulator in the destination buffer. */
00247       *pOut++ = (q15_t) (sum >> 15);
00248 
00249       /* Update the inputA and inputB pointers for next MAC calculation */
00250       py = ++pSrc2 - 1U;
00251       px = pIn1;
00252 
00253       /* Increment the MAC count */
00254       count++;
00255 
00256       /* Decrement the loop counter */
00257       blockSize1--;
00258     }
00259 
00260     /* --------------------------
00261      * Initializations of stage2
00262      * ------------------------*/
00263 
00264     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
00265      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
00266      * ....
00267      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
00268      */
00269 
00270     /* Working pointer of inputA */
00271     if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00272     {
00273       px = pIn1 + firstIndex - srcBLen + 1;
00274     }
00275     else
00276     {
00277       px = pIn1;
00278     }
00279 
00280     /* Working pointer of inputB */
00281     pSrc2 = pIn2 + (srcBLen - 1U);
00282     py = pSrc2;
00283 
00284     /* count is the index by which the pointer pIn1 to be incremented */
00285     count = 0U;
00286 
00287 
00288     /* --------------------
00289      * Stage2 process
00290      * -------------------*/
00291 
00292     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
00293      * So, to loop unroll over blockSize2,
00294      * srcBLen should be greater than or equal to 4 */
00295     if (srcBLen >= 4U)
00296     {
00297       /* Loop unroll over blockSize2, by 4 */
00298       blkCnt = ((uint32_t) blockSize2 >> 2U);
00299 
00300       while (blkCnt > 0U)
00301       {
00302       py = py - 1U;
00303 
00304         /* Set all accumulators to zero */
00305         acc0 = 0;
00306         acc1 = 0;
00307         acc2 = 0;
00308         acc3 = 0;
00309 
00310 
00311         /* read x[0], x[1] samples */
00312       x0 = *__SIMD32(px);
00313         /* read x[1], x[2] samples */
00314       x1 = _SIMD32_OFFSET(px+1);
00315       px+= 2U;
00316 
00317 
00318         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00319         k = srcBLen >> 2U;
00320 
00321         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00322          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00323         do
00324         {
00325           /* Read the last two inputB samples using SIMD:
00326            * y[srcBLen - 1] and y[srcBLen - 2] */
00327         c0 = *__SIMD32(py)--;
00328 
00329           /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
00330           acc0 = __SMLADX(x0, c0, acc0);
00331 
00332           /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
00333           acc1 = __SMLADX(x1, c0, acc1);
00334 
00335           /* Read x[2], x[3] */
00336         x2 = *__SIMD32(px);
00337 
00338           /* Read x[3], x[4] */
00339         x3 = _SIMD32_OFFSET(px+1);
00340 
00341           /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
00342           acc2 = __SMLADX(x2, c0, acc2);
00343 
00344           /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
00345           acc3 = __SMLADX(x3, c0, acc3);
00346 
00347           /* Read y[srcBLen - 3] and y[srcBLen - 4] */
00348         c0 = *__SIMD32(py)--;
00349 
00350           /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
00351           acc0 = __SMLADX(x2, c0, acc0);
00352 
00353           /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
00354           acc1 = __SMLADX(x3, c0, acc1);
00355 
00356           /* Read x[4], x[5] */
00357         x0 = _SIMD32_OFFSET(px+2);
00358 
00359           /* Read x[5], x[6] */
00360         x1 = _SIMD32_OFFSET(px+3);
00361         px += 4U;
00362 
00363           /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
00364           acc2 = __SMLADX(x0, c0, acc2);
00365 
00366           /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
00367           acc3 = __SMLADX(x1, c0, acc3);
00368 
00369         } while (--k);
00370 
00371         /* For the next MAC operations, SIMD is not used
00372          * So, the 16 bit pointer if inputB, py is updated */
00373 
00374         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00375          ** No loop unrolling is used. */
00376         k = srcBLen % 0x4U;
00377 
00378         if (k == 1U)
00379         {
00380           /* Read y[srcBLen - 5] */
00381         c0 = *(py+1);
00382 #ifdef  ARM_MATH_BIG_ENDIAN
00383 
00384         c0 = c0 << 16U;
00385 
00386 #else
00387 
00388         c0 = c0 & 0x0000FFFF;
00389 
00390 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00391 
00392           /* Read x[7] */
00393         x3 = *__SIMD32(px);
00394         px++;
00395 
00396           /* Perform the multiply-accumulates */
00397           acc0 = __SMLAD(x0, c0, acc0);
00398           acc1 = __SMLAD(x1, c0, acc1);
00399           acc2 = __SMLADX(x1, c0, acc2);
00400           acc3 = __SMLADX(x3, c0, acc3);
00401         }
00402 
00403         if (k == 2U)
00404         {
00405           /* Read y[srcBLen - 5], y[srcBLen - 6] */
00406         c0 = _SIMD32_OFFSET(py);
00407 
00408           /* Read x[7], x[8] */
00409         x3 = *__SIMD32(px);
00410 
00411         /* Read x[9] */
00412         x2 = _SIMD32_OFFSET(px+1);
00413         px += 2U;
00414 
00415           /* Perform the multiply-accumulates */
00416           acc0 = __SMLADX(x0, c0, acc0);
00417           acc1 = __SMLADX(x1, c0, acc1);
00418           acc2 = __SMLADX(x3, c0, acc2);
00419           acc3 = __SMLADX(x2, c0, acc3);
00420         }
00421 
00422         if (k == 3U)
00423         {
00424           /* Read y[srcBLen - 5], y[srcBLen - 6] */
00425         c0 = _SIMD32_OFFSET(py);
00426 
00427           /* Read x[7], x[8] */
00428         x3 = *__SIMD32(px);
00429 
00430           /* Read x[9] */
00431         x2 = _SIMD32_OFFSET(px+1);
00432 
00433           /* Perform the multiply-accumulates */
00434           acc0 = __SMLADX(x0, c0, acc0);
00435           acc1 = __SMLADX(x1, c0, acc1);
00436           acc2 = __SMLADX(x3, c0, acc2);
00437           acc3 = __SMLADX(x2, c0, acc3);
00438 
00439         c0 = *(py-1);
00440 #ifdef  ARM_MATH_BIG_ENDIAN
00441 
00442         c0 = c0 << 16U;
00443 #else
00444 
00445         c0 = c0 & 0x0000FFFF;
00446 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00447 
00448           /* Read x[10] */
00449         x3 =  _SIMD32_OFFSET(px+2);
00450         px += 3U;
00451 
00452           /* Perform the multiply-accumulates */
00453           acc0 = __SMLADX(x1, c0, acc0);
00454           acc1 = __SMLAD(x2, c0, acc1);
00455           acc2 = __SMLADX(x2, c0, acc2);
00456           acc3 = __SMLADX(x3, c0, acc3);
00457         }
00458 
00459         /* Store the results in the accumulators in the destination buffer. */
00460 #ifndef ARM_MATH_BIG_ENDIAN
00461 
00462         *__SIMD32(pOut)++ = __PKHBT(acc0 >> 15, acc1 >> 15, 16);
00463         *__SIMD32(pOut)++ = __PKHBT(acc2 >> 15, acc3 >> 15, 16);
00464 
00465 #else
00466 
00467         *__SIMD32(pOut)++ = __PKHBT(acc1 >> 15, acc0 >> 15, 16);
00468         *__SIMD32(pOut)++ = __PKHBT(acc3 >> 15, acc2 >> 15, 16);
00469 
00470 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
00471 
00472         /* Increment the pointer pIn1 index, count by 4 */
00473         count += 4U;
00474 
00475         /* Update the inputA and inputB pointers for next MAC calculation */
00476         px = pIn1 + count;
00477         py = pSrc2;
00478 
00479         /* Decrement the loop counter */
00480         blkCnt--;
00481       }
00482 
00483       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
00484        ** No loop unrolling is used. */
00485       blkCnt = (uint32_t) blockSize2 % 0x4U;
00486 
00487       while (blkCnt > 0U)
00488       {
00489         /* Accumulator is made zero for every iteration */
00490         sum = 0;
00491 
00492         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00493         k = srcBLen >> 2U;
00494 
00495         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00496          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00497         while (k > 0U)
00498         {
00499           /* Perform the multiply-accumulates */
00500           sum += ((q31_t) * px++ * *py--);
00501           sum += ((q31_t) * px++ * *py--);
00502           sum += ((q31_t) * px++ * *py--);
00503           sum += ((q31_t) * px++ * *py--);
00504 
00505           /* Decrement the loop counter */
00506           k--;
00507         }
00508 
00509         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00510          ** No loop unrolling is used. */
00511         k = srcBLen % 0x4U;
00512 
00513         while (k > 0U)
00514         {
00515           /* Perform the multiply-accumulates */
00516           sum += ((q31_t) * px++ * *py--);
00517 
00518           /* Decrement the loop counter */
00519           k--;
00520         }
00521 
00522         /* Store the result in the accumulator in the destination buffer. */
00523         *pOut++ = (q15_t) (sum >> 15);
00524 
00525         /* Increment the pointer pIn1 index, count by 1 */
00526         count++;
00527 
00528         /* Update the inputA and inputB pointers for next MAC calculation */
00529         if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00530         {
00531           px = pIn1 + firstIndex - srcBLen + 1 + count;
00532         }
00533         else
00534         {
00535           px = pIn1 + count;
00536         }
00537         py = pSrc2;
00538 
00539         /* Decrement the loop counter */
00540         blkCnt--;
00541       }
00542     }
00543     else
00544     {
00545       /* If the srcBLen is not a multiple of 4,
00546        * the blockSize2 loop cannot be unrolled by 4 */
00547       blkCnt = (uint32_t) blockSize2;
00548 
00549       while (blkCnt > 0U)
00550       {
00551         /* Accumulator is made zero for every iteration */
00552         sum = 0;
00553 
00554         /* srcBLen number of MACS should be performed */
00555         k = srcBLen;
00556 
00557         while (k > 0U)
00558         {
00559           /* Perform the multiply-accumulate */
00560           sum += ((q31_t) * px++ * *py--);
00561 
00562           /* Decrement the loop counter */
00563           k--;
00564         }
00565 
00566         /* Store the result in the accumulator in the destination buffer. */
00567         *pOut++ = (q15_t) (sum >> 15);
00568 
00569         /* Increment the MAC count */
00570         count++;
00571 
00572         /* Update the inputA and inputB pointers for next MAC calculation */
00573         if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00574         {
00575           px = pIn1 + firstIndex - srcBLen + 1 + count;
00576         }
00577         else
00578         {
00579           px = pIn1 + count;
00580         }
00581         py = pSrc2;
00582 
00583         /* Decrement the loop counter */
00584         blkCnt--;
00585       }
00586     }
00587 
00588 
00589     /* --------------------------
00590      * Initializations of stage3
00591      * -------------------------*/
00592 
00593     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
00594      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
00595      * ....
00596      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
00597      * sum +=  x[srcALen-1] * y[srcBLen-1]
00598      */
00599 
00600     /* In this stage the MAC operations are decreased by 1 for every iteration.
00601        The count variable holds the number of MAC operations performed */
00602     count = srcBLen - 1U;
00603 
00604     /* Working pointer of inputA */
00605     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
00606     px = pSrc1;
00607 
00608     /* Working pointer of inputB */
00609     pSrc2 = pIn2 + (srcBLen - 1U);
00610     pIn2 = pSrc2 - 1U;
00611     py = pIn2;
00612 
00613     /* -------------------
00614      * Stage3 process
00615      * ------------------*/
00616 
00617     /* For loop unrolling by 4, this stage is divided into two. */
00618     /* First part of this stage computes the MAC operations greater than 4 */
00619     /* Second part of this stage computes the MAC operations less than or equal to 4 */
00620 
00621     /* The first part of the stage starts here */
00622     j = count >> 2U;
00623 
00624     while ((j > 0U) && (blockSize3 > 0))
00625     {
00626       /* Accumulator is made zero for every iteration */
00627       sum = 0;
00628 
00629       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00630       k = count >> 2U;
00631 
00632       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00633        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00634       while (k > 0U)
00635       {
00636         /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
00637          * with y[srcBLen - 1], y[srcBLen - 2] respectively */
00638         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00639         /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
00640          * with y[srcBLen - 3], y[srcBLen - 4] respectively */
00641         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00642 
00643         /* Decrement the loop counter */
00644         k--;
00645       }
00646 
00647       /* For the next MAC operations, the pointer py is used without SIMD
00648        * So, py is incremented by 1 */
00649       py = py + 1U;
00650 
00651       /* If the count is not a multiple of 4, compute any remaining MACs here.
00652        ** No loop unrolling is used. */
00653       k = count % 0x4U;
00654 
00655       while (k > 0U)
00656       {
00657         /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
00658         sum = __SMLAD(*px++, *py--, sum);
00659 
00660         /* Decrement the loop counter */
00661         k--;
00662       }
00663 
00664       /* Store the result in the accumulator in the destination buffer. */
00665       *pOut++ = (q15_t) (sum >> 15);
00666 
00667       /* Update the inputA and inputB pointers for next MAC calculation */
00668       px = ++pSrc1;
00669       py = pIn2;
00670 
00671       /* Decrement the MAC count */
00672       count--;
00673 
00674       /* Decrement the loop counter */
00675       blockSize3--;
00676 
00677       j--;
00678     }
00679 
00680     /* The second part of the stage starts here */
00681     /* SIMD is not used for the next MAC operations,
00682      * so pointer py is updated to read only one sample at a time */
00683     py = py + 1U;
00684 
00685     while (blockSize3 > 0)
00686     {
00687       /* Accumulator is made zero for every iteration */
00688       sum = 0;
00689 
00690       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00691       k = count;
00692 
00693       while (k > 0U)
00694       {
00695         /* Perform the multiply-accumulates */
00696         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
00697         sum = __SMLAD(*px++, *py--, sum);
00698 
00699         /* Decrement the loop counter */
00700         k--;
00701       }
00702 
00703       /* Store the result in the accumulator in the destination buffer. */
00704       *pOut++ = (q15_t) (sum >> 15);
00705 
00706       /* Update the inputA and inputB pointers for next MAC calculation */
00707       px = ++pSrc1;
00708       py = pSrc2;
00709 
00710       /* Decrement the MAC count */
00711       count--;
00712 
00713       /* Decrement the loop counter */
00714       blockSize3--;
00715     }
00716 
00717     /* set status as ARM_MATH_SUCCESS */
00718     status = ARM_MATH_SUCCESS;
00719   }
00720 
00721   /* Return to application */
00722   return (status);
00723 
00724 #else
00725 
00726   q15_t *pIn1;                                   /* inputA pointer               */
00727   q15_t *pIn2;                                   /* inputB pointer               */
00728   q15_t *pOut = pDst;                            /* output pointer               */
00729   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator                  */
00730   q15_t *px;                                     /* Intermediate inputA pointer  */
00731   q15_t *py;                                     /* Intermediate inputB pointer  */
00732   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */
00733   q31_t x0, x1, x2, x3, c0;
00734   uint32_t j, k, count, check, blkCnt;
00735   int32_t blockSize1, blockSize2, blockSize3;    /* loop counters                 */
00736   arm_status status;                             /* status of Partial convolution */
00737   q15_t a, b;
00738 
00739   /* Check for range of output samples to be calculated */
00740   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
00741   {
00742     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00743     status = ARM_MATH_ARGUMENT_ERROR;
00744   }
00745   else
00746   {
00747 
00748     /* The algorithm implementation is based on the lengths of the inputs. */
00749     /* srcB is always made to slide across srcA. */
00750     /* So srcBLen is always considered as shorter or equal to srcALen */
00751     if (srcALen >=srcBLen)
00752     {
00753       /* Initialization of inputA pointer */
00754       pIn1 = pSrcA;
00755 
00756       /* Initialization of inputB pointer */
00757       pIn2 = pSrcB;
00758     }
00759     else
00760     {
00761       /* Initialization of inputA pointer */
00762       pIn1 = pSrcB;
00763 
00764       /* Initialization of inputB pointer */
00765       pIn2 = pSrcA;
00766 
00767       /* srcBLen is always considered as shorter or equal to srcALen */
00768       j = srcBLen;
00769       srcBLen = srcALen;
00770       srcALen = j;
00771     }
00772 
00773     /* Conditions to check which loopCounter holds
00774      * the first and last indices of the output samples to be calculated. */
00775     check = firstIndex + numPoints;
00776     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
00777     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
00778     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
00779     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :
00780                                      (int32_t) numPoints) : 0;
00781     blockSize2 = ((int32_t) check - blockSize3) -
00782       (blockSize1 + (int32_t) firstIndex);
00783     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
00784 
00785     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00786     /* The function is internally
00787      * divided into three stages according to the number of multiplications that has to be
00788      * taken place between inputA samples and inputB samples. In the first stage of the
00789      * algorithm, the multiplications increase by one for every iteration.
00790      * In the second stage of the algorithm, srcBLen number of multiplications are done.
00791      * In the third stage of the algorithm, the multiplications decrease by one
00792      * for every iteration. */
00793 
00794     /* Set the output pointer to point to the firstIndex
00795      * of the output sample to be calculated. */
00796     pOut = pDst + firstIndex;
00797 
00798     /* --------------------------
00799      * Initializations of stage1
00800      * -------------------------*/
00801 
00802     /* sum = x[0] * y[0]
00803      * sum = x[0] * y[1] + x[1] * y[0]
00804      * ....
00805      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
00806      */
00807 
00808     /* In this stage the MAC operations are increased by 1 for every iteration.
00809        The count variable holds the number of MAC operations performed.
00810        Since the partial convolution starts from firstIndex
00811        Number of Macs to be performed is firstIndex + 1 */
00812     count = 1U + firstIndex;
00813 
00814     /* Working pointer of inputA */
00815     px = pIn1;
00816 
00817     /* Working pointer of inputB */
00818     pSrc2 = pIn2 + firstIndex;
00819     py = pSrc2;
00820 
00821     /* ------------------------
00822      * Stage1 process
00823      * ----------------------*/
00824 
00825     /* For loop unrolling by 4, this stage is divided into two. */
00826     /* First part of this stage computes the MAC operations less than 4 */
00827     /* Second part of this stage computes the MAC operations greater than or equal to 4 */
00828 
00829     /* The first part of the stage starts here */
00830   while ((count < 4U) && (blockSize1 > 0))
00831     {
00832       /* Accumulator is made zero for every iteration */
00833       sum = 0;
00834 
00835       /* Loop over number of MAC operations between
00836        * inputA samples and inputB samples */
00837       k = count;
00838 
00839       while (k > 0U)
00840       {
00841         /* Perform the multiply-accumulates */
00842       sum += ((q31_t) * px++ * *py--);
00843 
00844         /* Decrement the loop counter */
00845         k--;
00846       }
00847 
00848       /* Store the result in the accumulator in the destination buffer. */
00849       *pOut++ = (q15_t) (sum >> 15);
00850 
00851       /* Update the inputA and inputB pointers for next MAC calculation */
00852       py = ++pSrc2;
00853       px = pIn1;
00854 
00855       /* Increment the MAC count */
00856       count++;
00857 
00858       /* Decrement the loop counter */
00859       blockSize1--;
00860     }
00861 
00862     /* The second part of the stage starts here */
00863     /* The internal loop, over count, is unrolled by 4 */
00864     /* To, read the last two inputB samples using SIMD:
00865      * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
00866     py = py - 1;
00867 
00868   while (blockSize1 > 0)
00869     {
00870       /* Accumulator is made zero for every iteration */
00871       sum = 0;
00872 
00873       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00874       k = count >> 2U;
00875 
00876       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00877        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00878     py++;
00879 
00880     while (k > 0U)
00881     {
00882       /* Perform the multiply-accumulates */
00883         sum += ((q31_t) * px++ * *py--);
00884         sum += ((q31_t) * px++ * *py--);
00885         sum += ((q31_t) * px++ * *py--);
00886         sum += ((q31_t) * px++ * *py--);
00887 
00888       /* Decrement the loop counter */
00889       k--;
00890     }
00891 
00892       /* If the count is not a multiple of 4, compute any remaining MACs here.
00893        ** No loop unrolling is used. */
00894       k = count % 0x4U;
00895 
00896       while (k > 0U)
00897       {
00898         /* Perform the multiply-accumulates */
00899       sum += ((q31_t) * px++ * *py--);
00900 
00901         /* Decrement the loop counter */
00902         k--;
00903       }
00904 
00905       /* Store the result in the accumulator in the destination buffer. */
00906       *pOut++ = (q15_t) (sum >> 15);
00907 
00908       /* Update the inputA and inputB pointers for next MAC calculation */
00909       py = ++pSrc2 - 1U;
00910       px = pIn1;
00911 
00912       /* Increment the MAC count */
00913       count++;
00914 
00915       /* Decrement the loop counter */
00916       blockSize1--;
00917     }
00918 
00919     /* --------------------------
00920      * Initializations of stage2
00921      * ------------------------*/
00922 
00923     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
00924      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
00925      * ....
00926      * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
00927      */
00928 
00929     /* Working pointer of inputA */
00930     if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00931     {
00932       px = pIn1 + firstIndex - srcBLen + 1;
00933     }
00934     else
00935     {
00936       px = pIn1;
00937     }
00938 
00939     /* Working pointer of inputB */
00940     pSrc2 = pIn2 + (srcBLen - 1U);
00941     py = pSrc2;
00942 
00943     /* count is the index by which the pointer pIn1 to be incremented */
00944     count = 0U;
00945 
00946 
00947     /* --------------------
00948      * Stage2 process
00949      * -------------------*/
00950 
00951     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
00952      * So, to loop unroll over blockSize2,
00953      * srcBLen should be greater than or equal to 4 */
00954     if (srcBLen >= 4U)
00955     {
00956       /* Loop unroll over blockSize2, by 4 */
00957       blkCnt = ((uint32_t) blockSize2 >> 2U);
00958 
00959       while (blkCnt > 0U)
00960       {
00961       py = py - 1U;
00962 
00963         /* Set all accumulators to zero */
00964         acc0 = 0;
00965         acc1 = 0;
00966         acc2 = 0;
00967         acc3 = 0;
00968 
00969       /* read x[0], x[1] samples */
00970       a = *px++;
00971       b = *px++;
00972 
00973 #ifndef ARM_MATH_BIG_ENDIAN
00974 
00975       x0 = __PKHBT(a, b, 16);
00976       a = *px;
00977       x1 = __PKHBT(b, a, 16);
00978 
00979 #else
00980 
00981       x0 = __PKHBT(b, a, 16);
00982       a = *px;
00983       x1 = __PKHBT(a, b, 16);
00984 
00985 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
00986 
00987       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00988       k = srcBLen >> 2U;
00989 
00990       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00991        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00992       do
00993       {
00994         /* Read the last two inputB samples using SIMD:
00995          * y[srcBLen - 1] and y[srcBLen - 2] */
00996         a = *py;
00997         b = *(py+1);
00998         py -= 2;
00999 
01000 #ifndef ARM_MATH_BIG_ENDIAN
01001 
01002         c0 = __PKHBT(a, b, 16);
01003 
01004 #else
01005 
01006         c0 = __PKHBT(b, a, 16);;
01007 
01008 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01009 
01010         /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
01011         acc0 = __SMLADX(x0, c0, acc0);
01012 
01013         /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
01014         acc1 = __SMLADX(x1, c0, acc1);
01015 
01016       a = *px;
01017       b = *(px + 1);
01018 
01019 #ifndef ARM_MATH_BIG_ENDIAN
01020 
01021       x2 = __PKHBT(a, b, 16);
01022       a = *(px + 2);
01023       x3 = __PKHBT(b, a, 16);
01024 
01025 #else
01026 
01027       x2 = __PKHBT(b, a, 16);
01028       a = *(px + 2);
01029       x3 = __PKHBT(a, b, 16);
01030 
01031 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
01032 
01033         /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
01034         acc2 = __SMLADX(x2, c0, acc2);
01035 
01036         /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
01037         acc3 = __SMLADX(x3, c0, acc3);
01038 
01039         /* Read y[srcBLen - 3] and y[srcBLen - 4] */
01040         a = *py;
01041         b = *(py+1);
01042         py -= 2;
01043 
01044 #ifndef ARM_MATH_BIG_ENDIAN
01045 
01046         c0 = __PKHBT(a, b, 16);
01047 
01048 #else
01049 
01050         c0 = __PKHBT(b, a, 16);;
01051 
01052 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01053 
01054         /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
01055         acc0 = __SMLADX(x2, c0, acc0);
01056 
01057         /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
01058         acc1 = __SMLADX(x3, c0, acc1);
01059 
01060         /* Read x[4], x[5], x[6] */
01061       a = *(px + 2);
01062       b = *(px + 3);
01063 
01064 #ifndef ARM_MATH_BIG_ENDIAN
01065 
01066       x0 = __PKHBT(a, b, 16);
01067       a = *(px + 4);
01068       x1 = __PKHBT(b, a, 16);
01069 
01070 #else
01071 
01072       x0 = __PKHBT(b, a, 16);
01073       a = *(px + 4);
01074       x1 = __PKHBT(a, b, 16);
01075 
01076 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
01077 
01078         px += 4U;
01079 
01080         /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
01081         acc2 = __SMLADX(x0, c0, acc2);
01082 
01083         /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
01084         acc3 = __SMLADX(x1, c0, acc3);
01085 
01086       } while (--k);
01087 
01088       /* For the next MAC operations, SIMD is not used
01089        * So, the 16 bit pointer of inputB, py is updated */
01090 
01091       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
01092        ** No loop unrolling is used. */
01093       k = srcBLen % 0x4U;
01094 
01095       if (k == 1U)
01096       {
01097         /* Read y[srcBLen - 5] */
01098         c0 = *(py+1);
01099 
01100 #ifdef  ARM_MATH_BIG_ENDIAN
01101 
01102         c0 = c0 << 16U;
01103 
01104 #else
01105 
01106         c0 = c0 & 0x0000FFFF;
01107 
01108 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
01109 
01110         /* Read x[7] */
01111         a = *px;
01112         b = *(px+1);
01113         px++;
01114 
01115 #ifndef ARM_MATH_BIG_ENDIAN
01116 
01117         x3 = __PKHBT(a, b, 16);
01118 
01119 #else
01120 
01121         x3 = __PKHBT(b, a, 16);;
01122 
01123 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01124 
01125 
01126         /* Perform the multiply-accumulates */
01127         acc0 = __SMLAD(x0, c0, acc0);
01128         acc1 = __SMLAD(x1, c0, acc1);
01129         acc2 = __SMLADX(x1, c0, acc2);
01130         acc3 = __SMLADX(x3, c0, acc3);
01131       }
01132 
01133       if (k == 2U)
01134       {
01135         /* Read y[srcBLen - 5], y[srcBLen - 6] */
01136         a = *py;
01137         b = *(py+1);
01138 
01139 #ifndef ARM_MATH_BIG_ENDIAN
01140 
01141         c0 = __PKHBT(a, b, 16);
01142 
01143 #else
01144 
01145         c0 = __PKHBT(b, a, 16);;
01146 
01147 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01148 
01149         /* Read x[7], x[8], x[9] */
01150       a = *px;
01151       b = *(px + 1);
01152 
01153 #ifndef ARM_MATH_BIG_ENDIAN
01154 
01155       x3 = __PKHBT(a, b, 16);
01156       a = *(px + 2);
01157       x2 = __PKHBT(b, a, 16);
01158 
01159 #else
01160 
01161       x3 = __PKHBT(b, a, 16);
01162       a = *(px + 2);
01163       x2 = __PKHBT(a, b, 16);
01164 
01165 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
01166         px += 2U;
01167 
01168         /* Perform the multiply-accumulates */
01169         acc0 = __SMLADX(x0, c0, acc0);
01170         acc1 = __SMLADX(x1, c0, acc1);
01171         acc2 = __SMLADX(x3, c0, acc2);
01172         acc3 = __SMLADX(x2, c0, acc3);
01173       }
01174 
01175       if (k == 3U)
01176       {
01177         /* Read y[srcBLen - 5], y[srcBLen - 6] */
01178         a = *py;
01179         b = *(py+1);
01180 
01181 #ifndef ARM_MATH_BIG_ENDIAN
01182 
01183         c0 = __PKHBT(a, b, 16);
01184 
01185 #else
01186 
01187         c0 = __PKHBT(b, a, 16);;
01188 
01189 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01190 
01191         /* Read x[7], x[8], x[9] */
01192       a = *px;
01193       b = *(px + 1);
01194 
01195 #ifndef ARM_MATH_BIG_ENDIAN
01196 
01197       x3 = __PKHBT(a, b, 16);
01198       a = *(px + 2);
01199       x2 = __PKHBT(b, a, 16);
01200 
01201 #else
01202 
01203       x3 = __PKHBT(b, a, 16);
01204       a = *(px + 2);
01205       x2 = __PKHBT(a, b, 16);
01206 
01207 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN    */
01208 
01209         /* Perform the multiply-accumulates */
01210         acc0 = __SMLADX(x0, c0, acc0);
01211         acc1 = __SMLADX(x1, c0, acc1);
01212         acc2 = __SMLADX(x3, c0, acc2);
01213         acc3 = __SMLADX(x2, c0, acc3);
01214 
01215         /* Read y[srcBLen - 7] */
01216         c0 = *(py-1);
01217 #ifdef  ARM_MATH_BIG_ENDIAN
01218 
01219         c0 = c0 << 16U;
01220 #else
01221 
01222         c0 = c0 & 0x0000FFFF;
01223 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
01224 
01225         /* Read x[10] */
01226         a = *(px+2);
01227         b = *(px+3);
01228 
01229 #ifndef ARM_MATH_BIG_ENDIAN
01230 
01231         x3 = __PKHBT(a, b, 16);
01232 
01233 #else
01234 
01235         x3 = __PKHBT(b, a, 16);;
01236 
01237 #endif  /*  #ifndef ARM_MATH_BIG_ENDIAN */
01238 
01239         px += 3U;
01240 
01241         /* Perform the multiply-accumulates */
01242         acc0 = __SMLADX(x1, c0, acc0);
01243         acc1 = __SMLAD(x2, c0, acc1);
01244         acc2 = __SMLADX(x2, c0, acc2);
01245         acc3 = __SMLADX(x3, c0, acc3);
01246       }
01247 
01248       /* Store the results in the accumulators in the destination buffer. */
01249       *pOut++ = (q15_t)(acc0 >> 15);
01250       *pOut++ = (q15_t)(acc1 >> 15);
01251       *pOut++ = (q15_t)(acc2 >> 15);
01252       *pOut++ = (q15_t)(acc3 >> 15);
01253 
01254         /* Increment the pointer pIn1 index, count by 4 */
01255         count += 4U;
01256 
01257         /* Update the inputA and inputB pointers for next MAC calculation */
01258         px = pIn1 + count;
01259         py = pSrc2;
01260 
01261         /* Decrement the loop counter */
01262         blkCnt--;
01263       }
01264 
01265       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
01266        ** No loop unrolling is used. */
01267       blkCnt = (uint32_t) blockSize2 % 0x4U;
01268 
01269       while (blkCnt > 0U)
01270       {
01271         /* Accumulator is made zero for every iteration */
01272         sum = 0;
01273 
01274         /* Apply loop unrolling and compute 4 MACs simultaneously. */
01275         k = srcBLen >> 2U;
01276 
01277         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
01278          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
01279         while (k > 0U)
01280         {
01281           /* Perform the multiply-accumulates */
01282           sum += ((q31_t) * px++ * *py--);
01283           sum += ((q31_t) * px++ * *py--);
01284           sum += ((q31_t) * px++ * *py--);
01285           sum += ((q31_t) * px++ * *py--);
01286 
01287           /* Decrement the loop counter */
01288           k--;
01289         }
01290 
01291         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
01292          ** No loop unrolling is used. */
01293         k = srcBLen % 0x4U;
01294 
01295         while (k > 0U)
01296         {
01297           /* Perform the multiply-accumulates */
01298           sum += ((q31_t) * px++ * *py--);
01299 
01300           /* Decrement the loop counter */
01301           k--;
01302         }
01303 
01304         /* Store the result in the accumulator in the destination buffer. */
01305         *pOut++ = (q15_t) (sum >> 15);
01306 
01307         /* Increment the pointer pIn1 index, count by 1 */
01308         count++;
01309 
01310         /* Update the inputA and inputB pointers for next MAC calculation */
01311         px = pIn1 + count;
01312         py = pSrc2;
01313 
01314         /* Decrement the loop counter */
01315         blkCnt--;
01316       }
01317     }
01318     else
01319     {
01320       /* If srcBLen is less than 4,
01321        * the blockSize2 loop cannot be unrolled by 4 */
01322       blkCnt = (uint32_t) blockSize2;
01323 
01324       while (blkCnt > 0U)
01325       {
01326         /* Accumulator is made zero for every iteration */
01327         sum = 0;
01328 
01329         /* srcBLen number of MACS should be performed */
01330         k = srcBLen;
01331 
01332         while (k > 0U)
01333         {
01334           /* Perform the multiply-accumulate */
01335           sum += ((q31_t) * px++ * *py--);
01336 
01337           /* Decrement the loop counter */
01338           k--;
01339         }
01340 
01341         /* Store the result in the accumulator in the destination buffer. */
01342         *pOut++ = (q15_t) (sum >> 15);
01343 
01344         /* Increment the MAC count */
01345         count++;
01346 
01347         /* Update the inputA and inputB pointers for next MAC calculation */
01348         px = pIn1 + count;
01349         py = pSrc2;
01350 
01351         /* Decrement the loop counter */
01352         blkCnt--;
01353       }
01354     }
01355 
01356 
01357     /* --------------------------
01358      * Initializations of stage3
01359      * -------------------------*/
01360 
01361     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
01362      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
01363      * ....
01364      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
01365      * sum +=  x[srcALen-1] * y[srcBLen-1]
01366      */
01367 
01368     /* In this stage the MAC operations are decreased by 1 for every iteration.
01369        The count variable holds the number of MAC operations performed */
01370     count = srcBLen - 1U;
01371 
01372     /* Working pointer of inputA */
01373     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
01374     px = pSrc1;
01375 
01376     /* Working pointer of inputB */
01377     pSrc2 = pIn2 + (srcBLen - 1U);
01378     pIn2 = pSrc2 - 1U;
01379     py = pIn2;
01380 
01381     /* -------------------
01382      * Stage3 process
01383      * ------------------*/
01384 
01385     /* For loop unrolling by 4, this stage is divided into two. */
01386     /* First part of this stage computes up to (count >> 2) output samples, with the MAC loop unrolled by 4 */
01387     /* Second part of this stage computes the remaining output samples, one MAC at a time */
01388 
01389     /* The first part of the stage starts here */
01390     j = count >> 2U;
01391 
01392     while ((j > 0U) && (blockSize3 > 0))
01393     {
01394       /* Accumulator is made zero for every iteration */
01395       sum = 0;
01396 
01397       /* Apply loop unrolling and compute 4 MACs simultaneously. */
01398       k = count >> 2U;
01399 
01400       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
01401        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
01402     py++;
01403 
01404     while (k > 0U)
01405     {
01406       /* Perform the multiply-accumulates */
01407         sum += ((q31_t) * px++ * *py--);
01408         sum += ((q31_t) * px++ * *py--);
01409         sum += ((q31_t) * px++ * *py--);
01410         sum += ((q31_t) * px++ * *py--);
01411       /* Decrement the loop counter */
01412       k--;
01413     }
01414 
01415 
01416       /* If the count is not a multiple of 4, compute any remaining MACs here.
01417        ** No loop unrolling is used. */
01418       k = count % 0x4U;
01419 
01420       while (k > 0U)
01421       {
01422       /* Perform the multiply-accumulates */
01423         sum += ((q31_t) * px++ * *py--);
01424 
01425         /* Decrement the loop counter */
01426         k--;
01427       }
01428 
01429       /* Store the result in the accumulator in the destination buffer. */
01430       *pOut++ = (q15_t) (sum >> 15);
01431 
01432       /* Update the inputA and inputB pointers for next MAC calculation */
01433       px = ++pSrc1;
01434       py = pIn2;
01435 
01436       /* Decrement the MAC count */
01437       count--;
01438 
01439       /* Decrement the loop counter */
01440       blockSize3--;
01441 
01442       j--;
01443     }
01444 
01445     /* The second part of the stage starts here */
01446     /* SIMD is not used for the next MAC operations,
01447      * so pointer py is updated to read only one sample at a time */
01448     py = py + 1U;
01449 
01450   while (blockSize3 > 0)
01451     {
01452       /* Accumulator is made zero for every iteration */
01453       sum = 0;
01454 
01455       /* Perform count MAC operations for this output sample; no loop unrolling is used. */
01456       k = count;
01457 
01458       while (k > 0U)
01459       {
01460         /* Perform the multiply-accumulates */
01461         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
01462         sum += ((q31_t) * px++ * *py--);
01463 
01464         /* Decrement the loop counter */
01465         k--;
01466       }
01467 
01468       /* Store the result in the accumulator in the destination buffer. */
01469       *pOut++ = (q15_t) (sum >> 15);
01470 
01471       /* Update the inputA and inputB pointers for next MAC calculation */
01472       px = ++pSrc1;
01473       py = pSrc2;
01474 
01475       /* Decrement the MAC count */
01476       count--;
01477 
01478       /* Decrement the loop counter */
01479       blockSize3--;
01480     }
01481 
01482     /* set status as ARM_MATH_SUCCESS */
01483     status = ARM_MATH_SUCCESS;
01484   }
01485 
01486   /* Return to application */
01487   return (status);
01488 
01489 #endif /*     #ifndef UNALIGNED_SUPPORT_DISABLE      */
01490 }
01491 
01492 /**
01493  * @} end of PartialConv group
01494  */
01495