Robert Lopez / CMSIS5
Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_q15.c Source File

arm_conv_partial_q15.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_conv_partial_q15.c
00004  * Description:  Partial convolution of Q15 sequences
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupFilters
00033  */
00034 
00035 /**
00036  * @addtogroup PartialConv
00037  * @{
00038  */
00039 
00040 /**
00041  * @brief Partial convolution of Q15 sequences.
00042  * @param[in]       *pSrcA points to the first input sequence.
00043  * @param[in]       srcALen length of the first input sequence.
00044  * @param[in]       *pSrcB points to the second input sequence.
00045  * @param[in]       srcBLen length of the second input sequence.
00046  * @param[out]      *pDst points to the location where the output result is written.
00047  * @param[in]       firstIndex is the first output sample to start with.
00048  * @param[in]       numPoints is the number of output points to be computed.
00049  * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
00050  *
00051  * Refer to <code>arm_conv_partial_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
00052  *
00053  * \par
00054  * Refer the function <code>arm_conv_partial_opt_q15()</code> for a faster implementation of this function using scratch buffers.
00055  *
00056  */
00057 
00058 arm_status arm_conv_partial_q15(
00059   q15_t * pSrcA,
00060   uint32_t srcALen,
00061   q15_t * pSrcB,
00062   uint32_t srcBLen,
00063   q15_t * pDst,
00064   uint32_t firstIndex,
00065   uint32_t numPoints)
00066 {
00067 
00068 
00069 #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
00070 
00071   /* Run the below code for Cortex-M4 and Cortex-M3 */
00072 
00073   q15_t *pIn1;                                   /* inputA pointer               */
00074   q15_t *pIn2;                                   /* inputB pointer               */
00075   q15_t *pOut = pDst;                            /* output pointer               */
00076   q63_t sum, acc0, acc1, acc2, acc3;             /* Accumulator                  */
00077   q15_t *px;                                     /* Intermediate inputA pointer  */
00078   q15_t *py;                                     /* Intermediate inputB pointer  */
00079   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */
00080   q31_t x0, x1, x2, x3, c0;                      /* Temporary input variables */
00081   uint32_t j, k, count, check, blkCnt;
00082   int32_t blockSize1, blockSize2, blockSize3;    /* loop counter                 */
00083   arm_status status;                             /* status of Partial convolution */
00084 
00085   /* Check for range of output samples to be calculated */
00086   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
00087   {
00088     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00089     status = ARM_MATH_ARGUMENT_ERROR;
00090   }
00091   else
00092   {
00093 
00094     /* The algorithm implementation is based on the lengths of the inputs. */
00095     /* srcB is always made to slide across srcA. */
00096     /* So srcBLen is always considered as shorter or equal to srcALen */
00097     if (srcALen >= srcBLen)
00098     {
00099       /* Initialization of inputA pointer */
00100       pIn1 = pSrcA;
00101 
00102       /* Initialization of inputB pointer */
00103       pIn2 = pSrcB;
00104     }
00105     else
00106     {
00107       /* Initialization of inputA pointer */
00108       pIn1 = pSrcB;
00109 
00110       /* Initialization of inputB pointer */
00111       pIn2 = pSrcA;
00112 
00113       /* srcBLen is always considered as shorter or equal to srcALen */
00114       j = srcBLen;
00115       srcBLen = srcALen;
00116       srcALen = j;
00117     }
00118 
00119     /* Conditions to check which loopCounter holds
00120      * the first and last indices of the output samples to be calculated. */
00121     check = firstIndex + numPoints;
00122     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
00123     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
00124     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
00125     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :
00126                                      (int32_t) numPoints) : 0;
00127     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
00128                                     (int32_t) firstIndex);
00129     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
00130 
00131     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00132     /* The function is internally
00133      * divided into three stages according to the number of multiplications that has to be
00134      * taken place between inputA samples and inputB samples. In the first stage of the
00135      * algorithm, the multiplications increase by one for every iteration.
00136      * In the second stage of the algorithm, srcBLen number of multiplications are done.
00137      * In the third stage of the algorithm, the multiplications decrease by one
00138      * for every iteration. */
00139 
00140     /* Set the output pointer to point to the firstIndex
00141      * of the output sample to be calculated. */
00142     pOut = pDst + firstIndex;
00143 
00144     /* --------------------------
00145      * Initializations of stage1
00146      * -------------------------*/
00147 
00148     /* sum = x[0] * y[0]
00149      * sum = x[0] * y[1] + x[1] * y[0]
00150      * ....
00151      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
00152      */
00153 
00154     /* In this stage the MAC operations are increased by 1 for every iteration.
00155        The count variable holds the number of MAC operations performed.
00156        Since the partial convolution starts from firstIndex
00157        Number of Macs to be performed is firstIndex + 1 */
00158     count = 1U + firstIndex;
00159 
00160     /* Working pointer of inputA */
00161     px = pIn1;
00162 
00163     /* Working pointer of inputB */
00164     pSrc2 = pIn2 + firstIndex;
00165     py = pSrc2;
00166 
00167     /* ------------------------
00168      * Stage1 process
00169      * ----------------------*/
00170 
00171     /* For loop unrolling by 4, this stage is divided into two. */
00172     /* First part of this stage computes the MAC operations less than 4 */
00173     /* Second part of this stage computes the MAC operations greater than or equal to 4 */
00174 
00175     /* The first part of the stage starts here */
00176     while ((count < 4U) && (blockSize1 > 0))
00177     {
00178       /* Accumulator is made zero for every iteration */
00179       sum = 0;
00180 
00181       /* Loop over number of MAC operations between
00182        * inputA samples and inputB samples */
00183       k = count;
00184 
00185       while (k > 0U)
00186       {
00187         /* Perform the multiply-accumulates */
00188         sum = __SMLALD(*px++, *py--, sum);
00189 
00190         /* Decrement the loop counter */
00191         k--;
00192       }
00193 
00194       /* Store the result in the accumulator in the destination buffer. */
00195       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00196 
00197       /* Update the inputA and inputB pointers for next MAC calculation */
00198       py = ++pSrc2;
00199       px = pIn1;
00200 
00201       /* Increment the MAC count */
00202       count++;
00203 
00204       /* Decrement the loop counter */
00205       blockSize1--;
00206     }
00207 
00208     /* The second part of the stage starts here */
00209     /* The internal loop, over count, is unrolled by 4 */
00210     /* To, read the last two inputB samples using SIMD:
00211      * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
00212     py = py - 1;
00213 
00214     while (blockSize1 > 0)
00215     {
00216       /* Accumulator is made zero for every iteration */
00217       sum = 0;
00218 
00219       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00220       k = count >> 2U;
00221 
00222       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00223        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00224       while (k > 0U)
00225       {
00226         /* Perform the multiply-accumulates */
00227         /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
00228         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00229         /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
00230         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00231 
00232         /* Decrement the loop counter */
00233         k--;
00234       }
00235 
00236       /* For the next MAC operations, the pointer py is used without SIMD
00237        * So, py is incremented by 1 */
00238       py = py + 1U;
00239 
00240       /* If the count is not a multiple of 4, compute any remaining MACs here.
00241        ** No loop unrolling is used. */
00242       k = count % 0x4U;
00243 
00244       while (k > 0U)
00245       {
00246         /* Perform the multiply-accumulates */
00247         sum = __SMLALD(*px++, *py--, sum);
00248 
00249         /* Decrement the loop counter */
00250         k--;
00251       }
00252 
00253       /* Store the result in the accumulator in the destination buffer. */
00254       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00255 
00256       /* Update the inputA and inputB pointers for next MAC calculation */
00257       py = ++pSrc2 - 1U;
00258       px = pIn1;
00259 
00260       /* Increment the MAC count */
00261       count++;
00262 
00263       /* Decrement the loop counter */
00264       blockSize1--;
00265     }
00266 
00267     /* --------------------------
00268      * Initializations of stage2
00269      * ------------------------*/
00270 
00271     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
00272      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
00273      * ....
00274      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
00275      */
00276 
00277     /* Working pointer of inputA */
00278     if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00279     {
00280       px = pIn1 + firstIndex - srcBLen + 1;
00281     }
00282     else
00283     {
00284       px = pIn1;
00285     }
00286 
00287     /* Working pointer of inputB */
00288     pSrc2 = pIn2 + (srcBLen - 1U);
00289     py = pSrc2;
00290 
00291     /* count is the index by which the pointer pIn1 to be incremented */
00292     count = 0U;
00293 
00294 
00295   /* --------------------
00296    * Stage2 process
00297    * -------------------*/
00298 
00299   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
00300    * So, to loop unroll over blockSize2,
00301    * srcBLen should be greater than or equal to 4 */
00302   if (srcBLen >= 4U)
00303   {
00304     /* Loop unroll over blockSize2, by 4 */
00305     blkCnt = blockSize2 >> 2U;
00306 
00307     while (blkCnt > 0U)
00308     {
00309       py = py - 1U;
00310 
00311       /* Set all accumulators to zero */
00312       acc0 = 0;
00313       acc1 = 0;
00314       acc2 = 0;
00315       acc3 = 0;
00316 
00317 
00318       /* read x[0], x[1] samples */
00319       x0 = *__SIMD32(px);
00320       /* read x[1], x[2] samples */
00321       x1 = _SIMD32_OFFSET(px+1);
00322       px+= 2U;
00323 
00324 
00325       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00326       k = srcBLen >> 2U;
00327 
00328       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00329        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00330       do
00331       {
00332         /* Read the last two inputB samples using SIMD:
00333          * y[srcBLen - 1] and y[srcBLen - 2] */
00334         c0 = *__SIMD32(py)--;
00335 
00336         /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
00337         acc0 = __SMLALDX(x0, c0, acc0);
00338 
00339         /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
00340         acc1 = __SMLALDX(x1, c0, acc1);
00341 
00342         /* Read x[2], x[3] */
00343         x2 = *__SIMD32(px);
00344 
00345         /* Read x[3], x[4] */
00346         x3 = _SIMD32_OFFSET(px+1);
00347 
00348         /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
00349         acc2 = __SMLALDX(x2, c0, acc2);
00350 
00351         /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
00352         acc3 = __SMLALDX(x3, c0, acc3);
00353 
00354         /* Read y[srcBLen - 3] and y[srcBLen - 4] */
00355         c0 = *__SIMD32(py)--;
00356 
00357         /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
00358         acc0 = __SMLALDX(x2, c0, acc0);
00359 
00360         /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
00361         acc1 = __SMLALDX(x3, c0, acc1);
00362 
00363         /* Read x[4], x[5] */
00364         x0 = _SIMD32_OFFSET(px+2);
00365 
00366         /* Read x[5], x[6] */
00367         x1 = _SIMD32_OFFSET(px+3);
00368         px += 4U;
00369 
00370         /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
00371         acc2 = __SMLALDX(x0, c0, acc2);
00372 
00373         /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
00374         acc3 = __SMLALDX(x1, c0, acc3);
00375 
00376       } while (--k);
00377 
00378       /* For the next MAC operations, SIMD is not used
00379        * So, the 16 bit pointer if inputB, py is updated */
00380 
00381       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00382        ** No loop unrolling is used. */
00383       k = srcBLen % 0x4U;
00384 
00385       if (k == 1U)
00386       {
00387         /* Read y[srcBLen - 5] */
00388         c0 = *(py+1);
00389 
00390 #ifdef  ARM_MATH_BIG_ENDIAN
00391 
00392         c0 = c0 << 16U;
00393 
00394 #else
00395 
00396         c0 = c0 & 0x0000FFFF;
00397 
00398 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00399 
00400         /* Read x[7] */
00401         x3 = *__SIMD32(px);
00402         px++;
00403 
00404         /* Perform the multiply-accumulates */
00405         acc0 = __SMLALD(x0, c0, acc0);
00406         acc1 = __SMLALD(x1, c0, acc1);
00407         acc2 = __SMLALDX(x1, c0, acc2);
00408         acc3 = __SMLALDX(x3, c0, acc3);
00409       }
00410 
00411       if (k == 2U)
00412       {
00413         /* Read y[srcBLen - 5], y[srcBLen - 6] */
00414         c0 = _SIMD32_OFFSET(py);
00415 
00416         /* Read x[7], x[8] */
00417         x3 = *__SIMD32(px);
00418 
00419         /* Read x[9] */
00420         x2 = _SIMD32_OFFSET(px+1);
00421         px += 2U;
00422 
00423         /* Perform the multiply-accumulates */
00424         acc0 = __SMLALDX(x0, c0, acc0);
00425         acc1 = __SMLALDX(x1, c0, acc1);
00426         acc2 = __SMLALDX(x3, c0, acc2);
00427         acc3 = __SMLALDX(x2, c0, acc3);
00428       }
00429 
00430       if (k == 3U)
00431       {
00432         /* Read y[srcBLen - 5], y[srcBLen - 6] */
00433         c0 = _SIMD32_OFFSET(py);
00434 
00435         /* Read x[7], x[8] */
00436         x3 = *__SIMD32(px);
00437 
00438         /* Read x[9] */
00439         x2 = _SIMD32_OFFSET(px+1);
00440 
00441         /* Perform the multiply-accumulates */
00442         acc0 = __SMLALDX(x0, c0, acc0);
00443         acc1 = __SMLALDX(x1, c0, acc1);
00444         acc2 = __SMLALDX(x3, c0, acc2);
00445         acc3 = __SMLALDX(x2, c0, acc3);
00446 
00447         c0 = *(py-1);
00448 
00449 #ifdef  ARM_MATH_BIG_ENDIAN
00450 
00451         c0 = c0 << 16U;
00452 #else
00453 
00454         c0 = c0 & 0x0000FFFF;
00455 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
00456 
00457         /* Read x[10] */
00458         x3 =  _SIMD32_OFFSET(px+2);
00459         px += 3U;
00460 
00461         /* Perform the multiply-accumulates */
00462         acc0 = __SMLALDX(x1, c0, acc0);
00463         acc1 = __SMLALD(x2, c0, acc1);
00464         acc2 = __SMLALDX(x2, c0, acc2);
00465         acc3 = __SMLALDX(x3, c0, acc3);
00466       }
00467 
00468 
00469       /* Store the results in the accumulators in the destination buffer. */
00470 
00471 #ifndef  ARM_MATH_BIG_ENDIAN
00472 
00473       *__SIMD32(pOut)++ =
00474         __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
00475       *__SIMD32(pOut)++ =
00476         __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
00477 
00478 #else
00479 
00480       *__SIMD32(pOut)++ =
00481         __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
00482       *__SIMD32(pOut)++ =
00483         __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
00484 
00485 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
00486 
00487       /* Increment the pointer pIn1 index, count by 4 */
00488       count += 4U;
00489 
00490       /* Update the inputA and inputB pointers for next MAC calculation */
00491       if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00492       {
00493         px = pIn1 + firstIndex - srcBLen + 1 + count;
00494       }
00495       else
00496       {
00497         px = pIn1 + count;
00498       }
00499       py = pSrc2;
00500 
00501         /* Decrement the loop counter */
00502         blkCnt--;
00503       }
00504 
00505       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
00506        ** No loop unrolling is used. */
00507       blkCnt = (uint32_t) blockSize2 % 0x4U;
00508 
00509       while (blkCnt > 0U)
00510       {
00511         /* Accumulator is made zero for every iteration */
00512         sum = 0;
00513 
00514         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00515         k = srcBLen >> 2U;
00516 
00517         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00518          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00519         while (k > 0U)
00520         {
00521           /* Perform the multiply-accumulates */
00522           sum += (q63_t) ((q31_t) * px++ * *py--);
00523           sum += (q63_t) ((q31_t) * px++ * *py--);
00524           sum += (q63_t) ((q31_t) * px++ * *py--);
00525           sum += (q63_t) ((q31_t) * px++ * *py--);
00526 
00527           /* Decrement the loop counter */
00528           k--;
00529         }
00530 
00531         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00532          ** No loop unrolling is used. */
00533         k = srcBLen % 0x4U;
00534 
00535         while (k > 0U)
00536         {
00537           /* Perform the multiply-accumulates */
00538           sum += (q63_t) ((q31_t) * px++ * *py--);
00539 
00540           /* Decrement the loop counter */
00541           k--;
00542         }
00543 
00544         /* Store the result in the accumulator in the destination buffer. */
00545         *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
00546 
00547         /* Increment the pointer pIn1 index, count by 1 */
00548         count++;
00549 
00550         /* Update the inputA and inputB pointers for next MAC calculation */
00551         if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00552         {
00553           px = pIn1 + firstIndex - srcBLen + 1 + count;
00554         }
00555         else
00556         {
00557           px = pIn1 + count;
00558         }
00559         py = pSrc2;
00560 
00561         /* Decrement the loop counter */
00562         blkCnt--;
00563       }
00564     }
00565     else
00566     {
00567       /* If the srcBLen is not a multiple of 4,
00568        * the blockSize2 loop cannot be unrolled by 4 */
00569       blkCnt = (uint32_t) blockSize2;
00570 
00571       while (blkCnt > 0U)
00572       {
00573         /* Accumulator is made zero for every iteration */
00574         sum = 0;
00575 
00576         /* srcBLen number of MACS should be performed */
00577         k = srcBLen;
00578 
00579         while (k > 0U)
00580         {
00581           /* Perform the multiply-accumulate */
00582           sum += (q63_t) ((q31_t) * px++ * *py--);
00583 
00584           /* Decrement the loop counter */
00585           k--;
00586         }
00587 
00588         /* Store the result in the accumulator in the destination buffer. */
00589         *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
00590 
00591         /* Increment the MAC count */
00592         count++;
00593 
00594         /* Update the inputA and inputB pointers for next MAC calculation */
00595         if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00596         {
00597           px = pIn1 + firstIndex - srcBLen + 1 + count;
00598         }
00599         else
00600         {
00601           px = pIn1 + count;
00602         }
00603         py = pSrc2;
00604 
00605         /* Decrement the loop counter */
00606         blkCnt--;
00607       }
00608     }
00609 
00610 
00611     /* --------------------------
00612      * Initializations of stage3
00613      * -------------------------*/
00614 
00615     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
00616      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
00617      * ....
00618      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
00619      * sum +=  x[srcALen-1] * y[srcBLen-1]
00620      */
00621 
00622     /* In this stage the MAC operations are decreased by 1 for every iteration.
00623        The count variable holds the number of MAC operations performed */
00624     count = srcBLen - 1U;
00625 
00626     /* Working pointer of inputA */
00627     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
00628     px = pSrc1;
00629 
00630     /* Working pointer of inputB */
00631     pSrc2 = pIn2 + (srcBLen - 1U);
00632     pIn2 = pSrc2 - 1U;
00633     py = pIn2;
00634 
00635     /* -------------------
00636      * Stage3 process
00637      * ------------------*/
00638 
00639     /* For loop unrolling by 4, this stage is divided into two. */
00640     /* First part of this stage computes the MAC operations greater than 4 */
00641     /* Second part of this stage computes the MAC operations less than or equal to 4 */
00642 
00643     /* The first part of the stage starts here */
00644     j = count >> 2U;
00645 
00646     while ((j > 0U) && (blockSize3 > 0))
00647     {
00648       /* Accumulator is made zero for every iteration */
00649       sum = 0;
00650 
00651       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00652       k = count >> 2U;
00653 
00654       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00655        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00656       while (k > 0U)
00657       {
00658         /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
00659          * with y[srcBLen - 1], y[srcBLen - 2] respectively */
00660         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00661         /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
00662          * with y[srcBLen - 3], y[srcBLen - 4] respectively */
00663         sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
00664 
00665         /* Decrement the loop counter */
00666         k--;
00667       }
00668 
00669       /* For the next MAC operations, the pointer py is used without SIMD
00670        * So, py is incremented by 1 */
00671       py = py + 1U;
00672 
00673       /* If the count is not a multiple of 4, compute any remaining MACs here.
00674        ** No loop unrolling is used. */
00675       k = count % 0x4U;
00676 
00677       while (k > 0U)
00678       {
00679         /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
00680         sum = __SMLALD(*px++, *py--, sum);
00681 
00682         /* Decrement the loop counter */
00683         k--;
00684       }
00685 
00686       /* Store the result in the accumulator in the destination buffer. */
00687       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00688 
00689       /* Update the inputA and inputB pointers for next MAC calculation */
00690       px = ++pSrc1;
00691       py = pIn2;
00692 
00693       /* Decrement the MAC count */
00694       count--;
00695 
00696       /* Decrement the loop counter */
00697       blockSize3--;
00698 
00699       j--;
00700     }
00701 
00702     /* The second part of the stage starts here */
00703     /* SIMD is not used for the next MAC operations,
00704      * so pointer py is updated to read only one sample at a time */
00705     py = py + 1U;
00706 
00707     while (blockSize3 > 0)
00708     {
00709       /* Accumulator is made zero for every iteration */
00710       sum = 0;
00711 
00712       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00713       k = count;
00714 
00715       while (k > 0U)
00716       {
00717         /* Perform the multiply-accumulates */
00718         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
00719         sum = __SMLALD(*px++, *py--, sum);
00720 
00721         /* Decrement the loop counter */
00722         k--;
00723       }
00724 
00725       /* Store the result in the accumulator in the destination buffer. */
00726       *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
00727 
00728       /* Update the inputA and inputB pointers for next MAC calculation */
00729       px = ++pSrc1;
00730       py = pSrc2;
00731 
00732       /* Decrement the MAC count */
00733       count--;
00734 
00735       /* Decrement the loop counter */
00736       blockSize3--;
00737     }
00738 
00739     /* set status as ARM_MATH_SUCCESS */
00740     status = ARM_MATH_SUCCESS;
00741   }
00742 
00743   /* Return to application */
00744   return (status);
00745 
00746 #else
00747 
00748   /* Run the below code for Cortex-M0 */
00749 
00750   q15_t *pIn1 = pSrcA;                           /* inputA pointer */
00751   q15_t *pIn2 = pSrcB;                           /* inputB pointer */
00752   q63_t sum;                                     /* Accumulator */
00753   uint32_t i, j;                                 /* loop counters */
00754   arm_status status;                             /* status of Partial convolution */
00755 
00756   /* Check for range of output samples to be calculated */
00757   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
00758   {
00759     /* Set status as ARM_ARGUMENT_ERROR */
00760     status = ARM_MATH_ARGUMENT_ERROR;
00761   }
00762   else
00763   {
00764     /* Loop to calculate convolution for output length number of values */
00765     for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
00766     {
00767       /* Initialize sum with zero to carry on MAC operations */
00768       sum = 0;
00769 
00770       /* Loop to perform MAC operations according to convolution equation */
00771       for (j = 0; j <= i; j++)
00772       {
00773         /* Check the array limitations */
00774         if (((i - j) < srcBLen) && (j < srcALen))
00775         {
00776           /* z[i] += x[i-j] * y[j] */
00777           sum += ((q31_t) pIn1[j] * (pIn2[i - j]));
00778         }
00779       }
00780 
00781       /* Store the output in the destination buffer */
00782       pDst[i] = (q15_t) __SSAT((sum >> 15U), 16U);
00783     }
00784     /* set status as ARM_SUCCESS as there are no argument errors */
00785     status = ARM_MATH_SUCCESS;
00786   }
00787   return (status);
00788 
00789 #endif /* #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */
00790 
00791 }
00792 
00793 /**
00794  * @} end of PartialConv group
00795  */
00796