Robert Lopez / CMSIS5
Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_q7.c Source File

arm_conv_partial_q7.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_conv_partial_q7.c
00004  * Description:  Partial convolution of Q7 sequences
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupFilters
00033  */
00034 
00035 /**
00036  * @addtogroup PartialConv
00037  * @{
00038  */
00039 
00040 /**
00041  * @brief Partial convolution of Q7 sequences.
00042  * @param[in]       *pSrcA points to the first input sequence.
00043  * @param[in]       srcALen length of the first input sequence.
00044  * @param[in]       *pSrcB points to the second input sequence.
00045  * @param[in]       srcBLen length of the second input sequence.
00046  * @param[out]      *pDst points to the location where the output result is written.
00047  * @param[in]       firstIndex is the first output sample to start with.
00048  * @param[in]       numPoints is the number of output points to be computed.
00049  * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
00050  *
00051  * \par
00052  * Refer the function <code>arm_conv_partial_opt_q7()</code> for a faster implementation of this function.
00053  *
00054  */
00055 
00056 arm_status arm_conv_partial_q7(
00057   q7_t * pSrcA,
00058   uint32_t srcALen,
00059   q7_t * pSrcB,
00060   uint32_t srcBLen,
00061   q7_t * pDst,
00062   uint32_t firstIndex,
00063   uint32_t numPoints)
00064 {
00065 
00066 
00067 #if defined (ARM_MATH_DSP)
00068 
00069   /* Run the below code for Cortex-M4 and Cortex-M3 */
00070 
00071   q7_t *pIn1;                                    /* inputA pointer */
00072   q7_t *pIn2;                                    /* inputB pointer */
00073   q7_t *pOut = pDst;                             /* output pointer */
00074   q7_t *px;                                      /* Intermediate inputA pointer */
00075   q7_t *py;                                      /* Intermediate inputB pointer */
00076   q7_t *pSrc1, *pSrc2;                           /* Intermediate pointers */
00077   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */
00078   q31_t input1, input2;
00079   q15_t in1, in2;
00080   q7_t x0, x1, x2, x3, c0, c1;
00081   uint32_t j, k, count, check, blkCnt;
00082   int32_t blockSize1, blockSize2, blockSize3;    /* loop counter */
00083   arm_status status;
00084 
00085 
00086   /* Check for range of output samples to be calculated */
00087   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
00088   {
00089     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00090     status = ARM_MATH_ARGUMENT_ERROR;
00091   }
00092   else
00093   {
00094 
00095     /* The algorithm implementation is based on the lengths of the inputs. */
00096     /* srcB is always made to slide across srcA. */
00097     /* So srcBLen is always considered as shorter or equal to srcALen */
00098     if (srcALen >= srcBLen)
00099     {
00100       /* Initialization of inputA pointer */
00101       pIn1 = pSrcA;
00102 
00103       /* Initialization of inputB pointer */
00104       pIn2 = pSrcB;
00105     }
00106     else
00107     {
00108       /* Initialization of inputA pointer */
00109       pIn1 = pSrcB;
00110 
00111       /* Initialization of inputB pointer */
00112       pIn2 = pSrcA;
00113 
00114       /* srcBLen is always considered as shorter or equal to srcALen */
00115       j = srcBLen;
00116       srcBLen = srcALen;
00117       srcALen = j;
00118     }
00119 
00120     /* Conditions to check which loopCounter holds
00121      * the first and last indices of the output samples to be calculated. */
00122     check = firstIndex + numPoints;
00123     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
00124     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
00125     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
00126     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :
00127                                      (int32_t) numPoints) : 0;
00128     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
00129                                     (int32_t) firstIndex);
00130     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
00131 
00132     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00133     /* The function is internally
00134      * divided into three stages according to the number of multiplications that has to be
00135      * taken place between inputA samples and inputB samples. In the first stage of the
00136      * algorithm, the multiplications increase by one for every iteration.
00137      * In the second stage of the algorithm, srcBLen number of multiplications are done.
00138      * In the third stage of the algorithm, the multiplications decrease by one
00139      * for every iteration. */
00140 
00141     /* Set the output pointer to point to the firstIndex
00142      * of the output sample to be calculated. */
00143     pOut = pDst + firstIndex;
00144 
00145     /* --------------------------
00146      * Initializations of stage1
00147      * -------------------------*/
00148 
00149     /* sum = x[0] * y[0]
00150      * sum = x[0] * y[1] + x[1] * y[0]
00151      * ....
00152      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
00153      */
00154 
00155     /* In this stage the MAC operations are increased by 1 for every iteration.
00156        The count variable holds the number of MAC operations performed.
00157        Since the partial convolution starts from from firstIndex
00158        Number of Macs to be performed is firstIndex + 1 */
00159     count = 1U + firstIndex;
00160 
00161     /* Working pointer of inputA */
00162     px = pIn1;
00163 
00164     /* Working pointer of inputB */
00165     pSrc2 = pIn2 + firstIndex;
00166     py = pSrc2;
00167 
00168     /* ------------------------
00169      * Stage1 process
00170      * ----------------------*/
00171 
00172     /* The first stage starts here */
00173     while (blockSize1 > 0)
00174     {
00175       /* Accumulator is made zero for every iteration */
00176       sum = 0;
00177 
00178       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00179       k = count >> 2U;
00180 
00181       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00182        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00183       while (k > 0U)
00184       {
00185         /* x[0] , x[1] */
00186         in1 = (q15_t) * px++;
00187         in2 = (q15_t) * px++;
00188         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00189 
00190         /* y[srcBLen - 1] , y[srcBLen - 2] */
00191         in1 = (q15_t) * py--;
00192         in2 = (q15_t) * py--;
00193         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00194 
00195         /* x[0] * y[srcBLen - 1] */
00196         /* x[1] * y[srcBLen - 2] */
00197         sum = __SMLAD(input1, input2, sum);
00198 
00199         /* x[2] , x[3] */
00200         in1 = (q15_t) * px++;
00201         in2 = (q15_t) * px++;
00202         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00203 
00204         /* y[srcBLen - 3] , y[srcBLen - 4] */
00205         in1 = (q15_t) * py--;
00206         in2 = (q15_t) * py--;
00207         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00208 
00209         /* x[2] * y[srcBLen - 3] */
00210         /* x[3] * y[srcBLen - 4] */
00211         sum = __SMLAD(input1, input2, sum);
00212 
00213         /* Decrement the loop counter */
00214         k--;
00215       }
00216 
00217       /* If the count is not a multiple of 4, compute any remaining MACs here.
00218        ** No loop unrolling is used. */
00219       k = count % 0x4U;
00220 
00221       while (k > 0U)
00222       {
00223         /* Perform the multiply-accumulates */
00224         sum += ((q31_t) * px++ * *py--);
00225 
00226         /* Decrement the loop counter */
00227         k--;
00228       }
00229 
00230       /* Store the result in the accumulator in the destination buffer. */
00231       *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
00232 
00233       /* Update the inputA and inputB pointers for next MAC calculation */
00234       py = ++pSrc2;
00235       px = pIn1;
00236 
00237       /* Increment the MAC count */
00238       count++;
00239 
00240       /* Decrement the loop counter */
00241       blockSize1--;
00242     }
00243 
00244     /* --------------------------
00245      * Initializations of stage2
00246      * ------------------------*/
00247 
00248     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
00249      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
00250      * ....
00251      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
00252      */
00253 
00254     /* Working pointer of inputA */
00255     if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00256     {
00257       px = pIn1 + firstIndex - srcBLen + 1;
00258     }
00259     else
00260     {
00261       px = pIn1;
00262     }
00263 
00264     /* Working pointer of inputB */
00265     pSrc2 = pIn2 + (srcBLen - 1U);
00266     py = pSrc2;
00267 
00268     /* count is index by which the pointer pIn1 to be incremented */
00269     count = 0U;
00270 
00271     /* -------------------
00272      * Stage2 process
00273      * ------------------*/
00274 
00275     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
00276      * So, to loop unroll over blockSize2,
00277      * srcBLen should be greater than or equal to 4 */
00278     if (srcBLen >= 4U)
00279     {
00280       /* Loop unroll over blockSize2, by 4 */
00281       blkCnt = ((uint32_t) blockSize2 >> 2U);
00282 
00283       while (blkCnt > 0U)
00284       {
00285         /* Set all accumulators to zero */
00286         acc0 = 0;
00287         acc1 = 0;
00288         acc2 = 0;
00289         acc3 = 0;
00290 
00291         /* read x[0], x[1], x[2] samples */
00292         x0 = *(px++);
00293         x1 = *(px++);
00294         x2 = *(px++);
00295 
00296         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00297         k = srcBLen >> 2U;
00298 
00299         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00300          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00301         do
00302         {
00303           /* Read y[srcBLen - 1] sample */
00304           c0 = *(py--);
00305           /* Read y[srcBLen - 2] sample */
00306           c1 = *(py--);
00307 
00308           /* Read x[3] sample */
00309           x3 = *(px++);
00310 
00311           /* x[0] and x[1] are packed */
00312           in1 = (q15_t) x0;
00313           in2 = (q15_t) x1;
00314 
00315           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00316 
00317           /* y[srcBLen - 1]   and y[srcBLen - 2] are packed */
00318           in1 = (q15_t) c0;
00319           in2 = (q15_t) c1;
00320 
00321           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00322 
00323           /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2]  */
00324           acc0 = __SMLAD(input1, input2, acc0);
00325 
00326           /* x[1] and x[2] are packed */
00327           in1 = (q15_t) x1;
00328           in2 = (q15_t) x2;
00329 
00330           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00331 
00332           /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2]  */
00333           acc1 = __SMLAD(input1, input2, acc1);
00334 
00335           /* x[2] and x[3] are packed */
00336           in1 = (q15_t) x2;
00337           in2 = (q15_t) x3;
00338 
00339           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00340 
00341           /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2]  */
00342           acc2 = __SMLAD(input1, input2, acc2);
00343 
00344           /* Read x[4] sample */
00345           x0 = *(px++);
00346 
00347           /* x[3] and x[4] are packed */
00348           in1 = (q15_t) x3;
00349           in2 = (q15_t) x0;
00350 
00351           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00352 
00353           /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2]  */
00354           acc3 = __SMLAD(input1, input2, acc3);
00355 
00356           /* Read y[srcBLen - 3] sample */
00357           c0 = *(py--);
00358           /* Read y[srcBLen - 4] sample */
00359           c1 = *(py--);
00360 
00361           /* Read x[5] sample */
00362           x1 = *(px++);
00363 
00364           /* x[2] and x[3] are packed */
00365           in1 = (q15_t) x2;
00366           in2 = (q15_t) x3;
00367 
00368           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00369 
00370           /* y[srcBLen - 3] and y[srcBLen - 4] are packed */
00371           in1 = (q15_t) c0;
00372           in2 = (q15_t) c1;
00373 
00374           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00375 
00376           /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4]  */
00377           acc0 = __SMLAD(input1, input2, acc0);
00378 
00379           /* x[3] and x[4] are packed */
00380           in1 = (q15_t) x3;
00381           in2 = (q15_t) x0;
00382 
00383           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00384 
00385           /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4]  */
00386           acc1 = __SMLAD(input1, input2, acc1);
00387 
00388           /* x[4] and x[5] are packed */
00389           in1 = (q15_t) x0;
00390           in2 = (q15_t) x1;
00391 
00392           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00393 
00394           /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4]  */
00395           acc2 = __SMLAD(input1, input2, acc2);
00396 
00397           /* Read x[6] sample */
00398           x2 = *(px++);
00399 
00400           /* x[5] and x[6] are packed */
00401           in1 = (q15_t) x1;
00402           in2 = (q15_t) x2;
00403 
00404           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00405 
00406           /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4]  */
00407           acc3 = __SMLAD(input1, input2, acc3);
00408 
00409         } while (--k);
00410 
00411         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00412          ** No loop unrolling is used. */
00413         k = srcBLen % 0x4U;
00414 
00415         while (k > 0U)
00416         {
00417           /* Read y[srcBLen - 5] sample */
00418           c0 = *(py--);
00419 
00420           /* Read x[7] sample */
00421           x3 = *(px++);
00422 
00423           /* Perform the multiply-accumulates */
00424           /* acc0 +=  x[4] * y[srcBLen - 5] */
00425           acc0 += ((q31_t) x0 * c0);
00426           /* acc1 +=  x[5] * y[srcBLen - 5] */
00427           acc1 += ((q31_t) x1 * c0);
00428           /* acc2 +=  x[6] * y[srcBLen - 5] */
00429           acc2 += ((q31_t) x2 * c0);
00430           /* acc3 +=  x[7] * y[srcBLen - 5] */
00431           acc3 += ((q31_t) x3 * c0);
00432 
00433           /* Reuse the present samples for the next MAC */
00434           x0 = x1;
00435           x1 = x2;
00436           x2 = x3;
00437 
00438           /* Decrement the loop counter */
00439           k--;
00440         }
00441 
00442         /* Store the result in the accumulator in the destination buffer. */
00443         *pOut++ = (q7_t) (__SSAT(acc0 >> 7, 8));
00444         *pOut++ = (q7_t) (__SSAT(acc1 >> 7, 8));
00445         *pOut++ = (q7_t) (__SSAT(acc2 >> 7, 8));
00446         *pOut++ = (q7_t) (__SSAT(acc3 >> 7, 8));
00447 
00448         /* Increment the pointer pIn1 index, count by 4 */
00449         count += 4U;
00450 
00451         /* Update the inputA and inputB pointers for next MAC calculation */
00452         if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00453         {
00454           px = pIn1 + firstIndex - srcBLen + 1 + count;
00455         }
00456         else
00457         {
00458           px = pIn1 + count;
00459         }
00460         py = pSrc2;
00461 
00462 
00463         /* Decrement the loop counter */
00464         blkCnt--;
00465       }
00466 
00467       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
00468        ** No loop unrolling is used. */
00469       blkCnt = (uint32_t) blockSize2 % 0x4U;
00470 
00471       while (blkCnt > 0U)
00472       {
00473         /* Accumulator is made zero for every iteration */
00474         sum = 0;
00475 
00476         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00477         k = srcBLen >> 2U;
00478 
00479         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00480          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00481         while (k > 0U)
00482         {
00483 
00484           /* Reading two inputs of SrcA buffer and packing */
00485           in1 = (q15_t) * px++;
00486           in2 = (q15_t) * px++;
00487           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00488 
00489           /* Reading two inputs of SrcB buffer and packing */
00490           in1 = (q15_t) * py--;
00491           in2 = (q15_t) * py--;
00492           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00493 
00494           /* Perform the multiply-accumulates */
00495           sum = __SMLAD(input1, input2, sum);
00496 
00497           /* Reading two inputs of SrcA buffer and packing */
00498           in1 = (q15_t) * px++;
00499           in2 = (q15_t) * px++;
00500           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00501 
00502           /* Reading two inputs of SrcB buffer and packing */
00503           in1 = (q15_t) * py--;
00504           in2 = (q15_t) * py--;
00505           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00506 
00507           /* Perform the multiply-accumulates */
00508           sum = __SMLAD(input1, input2, sum);
00509 
00510           /* Decrement the loop counter */
00511           k--;
00512         }
00513 
00514         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00515          ** No loop unrolling is used. */
00516         k = srcBLen % 0x4U;
00517 
00518         while (k > 0U)
00519         {
00520           /* Perform the multiply-accumulates */
00521           sum += ((q31_t) * px++ * *py--);
00522 
00523           /* Decrement the loop counter */
00524           k--;
00525         }
00526 
00527         /* Store the result in the accumulator in the destination buffer. */
00528         *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
00529 
00530         /* Increment the pointer pIn1 index, count by 1 */
00531         count++;
00532 
00533         /* Update the inputA and inputB pointers for next MAC calculation */
00534         if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00535         {
00536           px = pIn1 + firstIndex - srcBLen + 1 + count;
00537         }
00538         else
00539         {
00540           px = pIn1 + count;
00541         }
00542         py = pSrc2;
00543 
00544         /* Decrement the loop counter */
00545         blkCnt--;
00546       }
00547     }
00548     else
00549     {
00550       /* If the srcBLen is not a multiple of 4,
00551        * the blockSize2 loop cannot be unrolled by 4 */
00552       blkCnt = (uint32_t) blockSize2;
00553 
00554       while (blkCnt > 0U)
00555       {
00556         /* Accumulator is made zero for every iteration */
00557         sum = 0;
00558 
00559         /* srcBLen number of MACS should be performed */
00560         k = srcBLen;
00561 
00562         while (k > 0U)
00563         {
00564           /* Perform the multiply-accumulate */
00565           sum += ((q31_t) * px++ * *py--);
00566 
00567           /* Decrement the loop counter */
00568           k--;
00569         }
00570 
00571         /* Store the result in the accumulator in the destination buffer. */
00572         *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
00573 
00574         /* Increment the MAC count */
00575         count++;
00576 
00577         /* Update the inputA and inputB pointers for next MAC calculation */
00578         if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00579         {
00580           px = pIn1 + firstIndex - srcBLen + 1 + count;
00581         }
00582         else
00583         {
00584           px = pIn1 + count;
00585         }
00586         py = pSrc2;
00587 
00588         /* Decrement the loop counter */
00589         blkCnt--;
00590       }
00591     }
00592 
00593 
00594     /* --------------------------
00595      * Initializations of stage3
00596      * -------------------------*/
00597 
00598     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
00599      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
00600      * ....
00601      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
00602      * sum +=  x[srcALen-1] * y[srcBLen-1]
00603      */
00604 
00605     /* In this stage the MAC operations are decreased by 1 for every iteration.
00606        The count variable holds the number of MAC operations performed */
00607     count = srcBLen - 1U;
00608 
00609     /* Working pointer of inputA */
00610     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
00611     px = pSrc1;
00612 
00613     /* Working pointer of inputB */
00614     pSrc2 = pIn2 + (srcBLen - 1U);
00615     py = pSrc2;
00616 
00617     /* -------------------
00618      * Stage3 process
00619      * ------------------*/
00620 
00621     while (blockSize3 > 0)
00622     {
00623       /* Accumulator is made zero for every iteration */
00624       sum = 0;
00625 
00626       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00627       k = count >> 2U;
00628 
00629       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00630        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00631       while (k > 0U)
00632       {
00633         /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
00634         in1 = (q15_t) * px++;
00635         in2 = (q15_t) * px++;
00636         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00637 
00638         /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
00639         in1 = (q15_t) * py--;
00640         in2 = (q15_t) * py--;
00641         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00642 
00643         /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
00644         /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
00645         sum = __SMLAD(input1, input2, sum);
00646 
00647         /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
00648         in1 = (q15_t) * px++;
00649         in2 = (q15_t) * px++;
00650         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00651 
00652         /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
00653         in1 = (q15_t) * py--;
00654         in2 = (q15_t) * py--;
00655         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00656 
00657         /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
00658         /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
00659         sum = __SMLAD(input1, input2, sum);
00660 
00661         /* Decrement the loop counter */
00662         k--;
00663       }
00664 
00665       /* If the count is not a multiple of 4, compute any remaining MACs here.
00666        ** No loop unrolling is used. */
00667       k = count % 0x4U;
00668 
00669       while (k > 0U)
00670       {
00671         /* Perform the multiply-accumulates */
00672         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
00673         sum += ((q31_t) * px++ * *py--);
00674 
00675         /* Decrement the loop counter */
00676         k--;
00677       }
00678 
00679       /* Store the result in the accumulator in the destination buffer. */
00680       *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
00681 
00682       /* Update the inputA and inputB pointers for next MAC calculation */
00683       px = ++pSrc1;
00684       py = pSrc2;
00685 
00686       /* Decrement the MAC count */
00687       count--;
00688 
00689       /* Decrement the loop counter */
00690       blockSize3--;
00691 
00692     }
00693 
00694     /* set status as ARM_MATH_SUCCESS */
00695     status = ARM_MATH_SUCCESS;
00696   }
00697 
00698   /* Return to application */
00699   return (status);
00700 
00701 #else
00702 
00703   /* Run the below code for Cortex-M0 */
00704 
00705   q7_t *pIn1 = pSrcA;                            /* inputA pointer */
00706   q7_t *pIn2 = pSrcB;                            /* inputB pointer */
00707   q31_t sum;                                     /* Accumulator */
00708   uint32_t i, j;                                 /* loop counters */
00709   arm_status status;                             /* status of Partial convolution */
00710 
00711   /* Check for range of output samples to be calculated */
00712   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
00713   {
00714     /* Set status as ARM_ARGUMENT_ERROR */
00715     status = ARM_MATH_ARGUMENT_ERROR;
00716   }
00717   else
00718   {
00719     /* Loop to calculate convolution for output length number of values */
00720     for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
00721     {
00722       /* Initialize sum with zero to carry on MAC operations */
00723       sum = 0;
00724 
00725       /* Loop to perform MAC operations according to convolution equation */
00726       for (j = 0; j <= i; j++)
00727       {
00728         /* Check the array limitations */
00729         if (((i - j) < srcBLen) && (j < srcALen))
00730         {
00731           /* z[i] += x[i-j] * y[j] */
00732           sum += ((q15_t) pIn1[j] * (pIn2[i - j]));
00733         }
00734       }
00735 
00736       /* Store the output in the destination buffer */
00737       pDst[i] = (q7_t) __SSAT((sum >> 7U), 8U);
00738     }
00739     /* set status as ARM_SUCCESS as there are no argument errors */
00740     status = ARM_MATH_SUCCESS;
00741   }
00742   return (status);
00743 
00744 #endif /*  #if defined (ARM_MATH_DSP) */
00745 
00746 }
00747 
00748 /**
00749  * @} end of PartialConv group
00750  */
00751