Robert Lopez / CMSIS5
Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_f32.c Source File

arm_conv_partial_f32.c

00001 /* ----------------------------------------------------------------------
00002  * Project:      CMSIS DSP Library
00003  * Title:        arm_conv_partial_f32.c
00004  * Description:  Partial convolution of floating-point sequences
00005  *
00006  * $Date:        27. January 2017
00007  * $Revision:    V.1.5.1
00008  *
00009  * Target Processor: Cortex-M cores
00010  * -------------------------------------------------------------------- */
00011 /*
00012  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
00013  *
00014  * SPDX-License-Identifier: Apache-2.0
00015  *
00016  * Licensed under the Apache License, Version 2.0 (the License); you may
00017  * not use this file except in compliance with the License.
00018  * You may obtain a copy of the License at
00019  *
00020  * www.apache.org/licenses/LICENSE-2.0
00021  *
00022  * Unless required by applicable law or agreed to in writing, software
00023  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
00024  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00025  * See the License for the specific language governing permissions and
00026  * limitations under the License.
00027  */
00028 
00029 #include "arm_math.h"
00030 
00031 /**
00032  * @ingroup groupFilters
00033  */
00034 
00035 /**
00036  * @defgroup PartialConv Partial Convolution
00037  *
00038  * Partial Convolution is equivalent to Convolution except that a subset of the output samples is generated.
00039  * Each function has two additional arguments.
00040  * <code>firstIndex</code> specifies the starting index of the subset of output samples.
00041  * <code>numPoints</code> is the number of output samples to compute.
00042  * The function computes the output in the range
00043  * <code>[firstIndex, ..., firstIndex+numPoints-1]</code>.
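00044  * The computed samples are stored at <code>pDst[firstIndex]</code> through <code>pDst[firstIndex+numPoints-1]</code> (this is what arm_conv_partial_f32() does), so <code>pDst</code> must have room for at least <code>firstIndex+numPoints</code> values.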
00045  *
00046  * The allowable range of output indices is [0 srcALen+srcBLen-2].
00047  * If the requested subset does not fall in this range then the functions return ARM_MATH_ARGUMENT_ERROR.
00048  * Otherwise the functions return ARM_MATH_SUCCESS.
00049  * \note Refer to arm_conv_f32() for details on fixed point behavior.
00050  *
00051  *
00052  * <b>Fast Versions</b>
00053  *
00054  * \par
00055  * Fast versions of partial convolution are provided for Q31 and Q15.  They take fewer cycles than the standard Q31 and Q15 versions, but they require
00056  * the input signals to be scaled down to avoid intermediate overflows.
00057  *
00058  *
00059  * <b>Opt Versions</b>
00060  *
00061  * \par
00062  * Opt versions are provided for Q15 and Q7.  The design uses an internal scratch buffer to achieve better optimisation.
00063  * These versions take fewer cycles but consume more memory (scratch memory) than the standard Q15 and Q7 versions of partial convolution.
00064  */
00065 
00066 /**
00067  * @addtogroup PartialConv
00068  * @{
00069  */
00070 
00071 /**
00072  * @brief Partial convolution of floating-point sequences.
00073  * @param[in]       *pSrcA points to the first input sequence.
00074  * @param[in]       srcALen length of the first input sequence.
00075  * @param[in]       *pSrcB points to the second input sequence.
00076  * @param[in]       srcBLen length of the second input sequence.
00077  * @param[out]      *pDst points to the location where the output result is written.
00078  * @param[in]       firstIndex is the first output sample to start with.
00079  * @param[in]       numPoints is the number of output points to be computed.
00080  * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
00081  */
00082 
00083 arm_status arm_conv_partial_f32(
00084   float32_t * pSrcA,
00085   uint32_t srcALen,
00086   float32_t * pSrcB,
00087   uint32_t srcBLen,
00088   float32_t * pDst,
00089   uint32_t firstIndex,
00090   uint32_t numPoints)
00091 {
00092 
00093 
00094 #if defined (ARM_MATH_DSP)
00095 
00096   /* Run the below code for Cortex-M4 and Cortex-M3 */
00097 
00098   float32_t *pIn1 = pSrcA;                       /* inputA pointer */
00099   float32_t *pIn2 = pSrcB;                       /* inputB pointer */
00100   float32_t *pOut = pDst;                        /* output pointer */
00101   float32_t *px;                                 /* Intermediate inputA pointer */
00102   float32_t *py;                                 /* Intermediate inputB pointer */
00103   float32_t *pSrc1, *pSrc2;                      /* Intermediate pointers */
00104   float32_t sum, acc0, acc1, acc2, acc3;         /* Accumulator */
00105   float32_t x0, x1, x2, x3, c0;                  /* Temporary variables to hold state and coefficient values */
00106   uint32_t j, k, count = 0U, blkCnt, check;
00107   int32_t blockSize1, blockSize2, blockSize3;    /* loop counters */
00108   arm_status status;                             /* status of Partial convolution */
00109 
00110 
00111   /* Check for range of output samples to be calculated */
00112   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
00113   {
00114     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00115     status = ARM_MATH_ARGUMENT_ERROR;
00116   }
00117   else
00118   {
00119 
00120     /* The algorithm implementation is based on the lengths of the inputs. */
00121     /* srcB is always made to slide across srcA. */
00122     /* So srcBLen is always considered as shorter or equal to srcALen */
00123     if (srcALen >= srcBLen)
00124     {
00125       /* Initialization of inputA pointer */
00126       pIn1 = pSrcA;
00127 
00128       /* Initialization of inputB pointer */
00129       pIn2 = pSrcB;
00130     }
00131     else
00132     {
00133       /* Initialization of inputA pointer */
00134       pIn1 = pSrcB;
00135 
00136       /* Initialization of inputB pointer */
00137       pIn2 = pSrcA;
00138 
00139       /* srcBLen is always considered as shorter or equal to srcALen */
00140       j = srcBLen;
00141       srcBLen = srcALen;
00142       srcALen = j;
00143     }
00144 
00145     /* Conditions to check which loopCounter holds
00146      * the first and last indices of the output samples to be calculated. */
00147     check = firstIndex + numPoints;
00148     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
00149     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
00150     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
00151     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :
00152                                      (int32_t) numPoints) : 0;
00153     blockSize2 = ((int32_t) check - blockSize3) -
00154       (blockSize1 + (int32_t) firstIndex);
00155     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
00156 
00157     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00158     /* The function is internally
00159      * divided into three stages according to the number of multiplications that has to be
00160      * taken place between inputA samples and inputB samples. In the first stage of the
00161      * algorithm, the multiplications increase by one for every iteration.
00162      * In the second stage of the algorithm, srcBLen number of multiplications are done.
00163      * In the third stage of the algorithm, the multiplications decrease by one
00164      * for every iteration. */
00165 
00166     /* Set the output pointer to point to the firstIndex
00167      * of the output sample to be calculated. */
00168     pOut = pDst + firstIndex;
00169 
00170     /* --------------------------
00171      * Initializations of stage1
00172      * -------------------------*/
00173 
00174     /* sum = x[0] * y[0]
00175      * sum = x[0] * y[1] + x[1] * y[0]
00176      * ....
00177      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
00178      */
00179 
00180     /* In this stage the MAC operations are increased by 1 for every iteration.
00181        The count variable holds the number of MAC operations performed.
00182        Since the partial convolution starts from from firstIndex
00183        Number of Macs to be performed is firstIndex + 1 */
00184     count = 1U + firstIndex;
00185 
00186     /* Working pointer of inputA */
00187     px = pIn1;
00188 
00189     /* Working pointer of inputB */
00190     pSrc1 = pIn2 + firstIndex;
00191     py = pSrc1;
00192 
00193     /* ------------------------
00194      * Stage1 process
00195      * ----------------------*/
00196 
00197     /* The first stage starts here */
00198     while (blockSize1 > 0)
00199     {
00200       /* Accumulator is made zero for every iteration */
00201       sum = 0.0f;
00202 
00203       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00204       k = count >> 2U;
00205 
00206       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00207        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00208       while (k > 0U)
00209       {
00210         /* x[0] * y[srcBLen - 1] */
00211         sum += *px++ * *py--;
00212 
00213         /* x[1] * y[srcBLen - 2] */
00214         sum += *px++ * *py--;
00215 
00216         /* x[2] * y[srcBLen - 3] */
00217         sum += *px++ * *py--;
00218 
00219         /* x[3] * y[srcBLen - 4] */
00220         sum += *px++ * *py--;
00221 
00222         /* Decrement the loop counter */
00223         k--;
00224       }
00225 
00226       /* If the count is not a multiple of 4, compute any remaining MACs here.
00227        ** No loop unrolling is used. */
00228       k = count % 0x4U;
00229 
00230       while (k > 0U)
00231       {
00232         /* Perform the multiply-accumulates */
00233         sum += *px++ * *py--;
00234 
00235         /* Decrement the loop counter */
00236         k--;
00237       }
00238 
00239       /* Store the result in the accumulator in the destination buffer. */
00240       *pOut++ = sum;
00241 
00242       /* Update the inputA and inputB pointers for next MAC calculation */
00243       py = ++pSrc1;
00244       px = pIn1;
00245 
00246       /* Increment the MAC count */
00247       count++;
00248 
00249       /* Decrement the loop counter */
00250       blockSize1--;
00251     }
00252 
00253     /* --------------------------
00254      * Initializations of stage2
00255      * ------------------------*/
00256 
00257     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
00258      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
00259      * ....
00260      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
00261      */
00262 
00263     /* Working pointer of inputA */
00264     if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00265     {
00266       px = pIn1 + firstIndex - srcBLen + 1;
00267     }
00268     else
00269     {
00270       px = pIn1;
00271     }
00272 
00273     /* Working pointer of inputB */
00274     pSrc2 = pIn2 + (srcBLen - 1U);
00275     py = pSrc2;
00276 
00277     /* count is index by which the pointer pIn1 to be incremented */
00278     count = 0U;
00279 
00280     /* -------------------
00281      * Stage2 process
00282      * ------------------*/
00283 
00284     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
00285      * So, to loop unroll over blockSize2,
00286      * srcBLen should be greater than or equal to 4 */
00287     if (srcBLen >= 4U)
00288     {
00289       /* Loop unroll over blockSize2, by 4 */
00290       blkCnt = ((uint32_t) blockSize2 >> 2U);
00291 
00292       while (blkCnt > 0U)
00293       {
00294         /* Set all accumulators to zero */
00295         acc0 = 0.0f;
00296         acc1 = 0.0f;
00297         acc2 = 0.0f;
00298         acc3 = 0.0f;
00299 
00300         /* read x[0], x[1], x[2] samples */
00301         x0 = *(px++);
00302         x1 = *(px++);
00303         x2 = *(px++);
00304 
00305         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00306         k = srcBLen >> 2U;
00307 
00308         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00309          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00310         do
00311         {
00312           /* Read y[srcBLen - 1] sample */
00313           c0 = *(py--);
00314 
00315           /* Read x[3] sample */
00316           x3 = *(px++);
00317 
00318           /* Perform the multiply-accumulate */
00319           /* acc0 +=  x[0] * y[srcBLen - 1] */
00320           acc0 += x0 * c0;
00321 
00322           /* acc1 +=  x[1] * y[srcBLen - 1] */
00323           acc1 += x1 * c0;
00324 
00325           /* acc2 +=  x[2] * y[srcBLen - 1] */
00326           acc2 += x2 * c0;
00327 
00328           /* acc3 +=  x[3] * y[srcBLen - 1] */
00329           acc3 += x3 * c0;
00330 
00331           /* Read y[srcBLen - 2] sample */
00332           c0 = *(py--);
00333 
00334           /* Read x[4] sample */
00335           x0 = *(px++);
00336 
00337           /* Perform the multiply-accumulate */
00338           /* acc0 +=  x[1] * y[srcBLen - 2] */
00339           acc0 += x1 * c0;
00340           /* acc1 +=  x[2] * y[srcBLen - 2] */
00341           acc1 += x2 * c0;
00342           /* acc2 +=  x[3] * y[srcBLen - 2] */
00343           acc2 += x3 * c0;
00344           /* acc3 +=  x[4] * y[srcBLen - 2] */
00345           acc3 += x0 * c0;
00346 
00347           /* Read y[srcBLen - 3] sample */
00348           c0 = *(py--);
00349 
00350           /* Read x[5] sample */
00351           x1 = *(px++);
00352 
00353           /* Perform the multiply-accumulates */
00354           /* acc0 +=  x[2] * y[srcBLen - 3] */
00355           acc0 += x2 * c0;
00356           /* acc1 +=  x[3] * y[srcBLen - 2] */
00357           acc1 += x3 * c0;
00358           /* acc2 +=  x[4] * y[srcBLen - 2] */
00359           acc2 += x0 * c0;
00360           /* acc3 +=  x[5] * y[srcBLen - 2] */
00361           acc3 += x1 * c0;
00362 
00363           /* Read y[srcBLen - 4] sample */
00364           c0 = *(py--);
00365 
00366           /* Read x[6] sample */
00367           x2 = *(px++);
00368 
00369           /* Perform the multiply-accumulates */
00370           /* acc0 +=  x[3] * y[srcBLen - 4] */
00371           acc0 += x3 * c0;
00372           /* acc1 +=  x[4] * y[srcBLen - 4] */
00373           acc1 += x0 * c0;
00374           /* acc2 +=  x[5] * y[srcBLen - 4] */
00375           acc2 += x1 * c0;
00376           /* acc3 +=  x[6] * y[srcBLen - 4] */
00377           acc3 += x2 * c0;
00378 
00379 
00380         } while (--k);
00381 
00382         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00383          ** No loop unrolling is used. */
00384         k = srcBLen % 0x4U;
00385 
00386         while (k > 0U)
00387         {
00388           /* Read y[srcBLen - 5] sample */
00389           c0 = *(py--);
00390 
00391           /* Read x[7] sample */
00392           x3 = *(px++);
00393 
00394           /* Perform the multiply-accumulates */
00395           /* acc0 +=  x[4] * y[srcBLen - 5] */
00396           acc0 += x0 * c0;
00397           /* acc1 +=  x[5] * y[srcBLen - 5] */
00398           acc1 += x1 * c0;
00399           /* acc2 +=  x[6] * y[srcBLen - 5] */
00400           acc2 += x2 * c0;
00401           /* acc3 +=  x[7] * y[srcBLen - 5] */
00402           acc3 += x3 * c0;
00403 
00404           /* Reuse the present samples for the next MAC */
00405           x0 = x1;
00406           x1 = x2;
00407           x2 = x3;
00408 
00409           /* Decrement the loop counter */
00410           k--;
00411         }
00412 
00413         /* Store the result in the accumulator in the destination buffer. */
00414         *pOut++ = acc0;
00415         *pOut++ = acc1;
00416         *pOut++ = acc2;
00417         *pOut++ = acc3;
00418 
00419         /* Increment the pointer pIn1 index, count by 1 */
00420         count += 4U;
00421 
00422         /* Update the inputA and inputB pointers for next MAC calculation */
00423         if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00424         {
00425           px = pIn1 + firstIndex - srcBLen + 1 + count;
00426         }
00427         else
00428         {
00429           px = pIn1 + count;
00430         }
00431         py = pSrc2;
00432 
00433         /* Decrement the loop counter */
00434         blkCnt--;
00435       }
00436 
00437       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
00438        ** No loop unrolling is used. */
00439       blkCnt = (uint32_t) blockSize2 % 0x4U;
00440 
00441       while (blkCnt > 0U)
00442       {
00443         /* Accumulator is made zero for every iteration */
00444         sum = 0.0f;
00445 
00446         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00447         k = srcBLen >> 2U;
00448 
00449         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00450          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00451         while (k > 0U)
00452         {
00453           /* Perform the multiply-accumulates */
00454           sum += *px++ * *py--;
00455           sum += *px++ * *py--;
00456           sum += *px++ * *py--;
00457           sum += *px++ * *py--;
00458 
00459           /* Decrement the loop counter */
00460           k--;
00461         }
00462 
00463         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
00464          ** No loop unrolling is used. */
00465         k = srcBLen % 0x4U;
00466 
00467         while (k > 0U)
00468         {
00469           /* Perform the multiply-accumulate */
00470           sum += *px++ * *py--;
00471 
00472           /* Decrement the loop counter */
00473           k--;
00474         }
00475 
00476         /* Store the result in the accumulator in the destination buffer. */
00477         *pOut++ = sum;
00478 
00479         /* Increment the MAC count */
00480         count++;
00481 
00482         /* Update the inputA and inputB pointers for next MAC calculation */
00483         if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00484         {
00485           px = pIn1 + firstIndex - srcBLen + 1 + count;
00486         }
00487         else
00488         {
00489           px = pIn1 + count;
00490         }
00491         py = pSrc2;
00492 
00493         /* Decrement the loop counter */
00494         blkCnt--;
00495       }
00496     }
00497     else
00498     {
00499       /* If the srcBLen is not a multiple of 4,
00500        * the blockSize2 loop cannot be unrolled by 4 */
00501       blkCnt = (uint32_t) blockSize2;
00502 
00503       while (blkCnt > 0U)
00504       {
00505         /* Accumulator is made zero for every iteration */
00506         sum = 0.0f;
00507 
00508         /* srcBLen number of MACS should be performed */
00509         k = srcBLen;
00510 
00511         while (k > 0U)
00512         {
00513           /* Perform the multiply-accumulate */
00514           sum += *px++ * *py--;
00515 
00516           /* Decrement the loop counter */
00517           k--;
00518         }
00519 
00520         /* Store the result in the accumulator in the destination buffer. */
00521         *pOut++ = sum;
00522 
00523         /* Increment the MAC count */
00524         count++;
00525 
00526         /* Update the inputA and inputB pointers for next MAC calculation */
00527         if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00528         {
00529           px = pIn1 + firstIndex - srcBLen + 1 + count;
00530         }
00531         else
00532         {
00533           px = pIn1 + count;
00534         }
00535         py = pSrc2;
00536 
00537         /* Decrement the loop counter */
00538         blkCnt--;
00539       }
00540     }
00541 
00542 
00543     /* --------------------------
00544      * Initializations of stage3
00545      * -------------------------*/
00546 
00547     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
00548      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
00549      * ....
00550      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
00551      * sum +=  x[srcALen-1] * y[srcBLen-1]
00552      */
00553 
00554     /* In this stage the MAC operations are decreased by 1 for every iteration.
00555        The count variable holds the number of MAC operations performed */
00556     count = srcBLen - 1U;
00557 
00558     /* Working pointer of inputA */
00559     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
00560     px = pSrc1;
00561 
00562     /* Working pointer of inputB */
00563     pSrc2 = pIn2 + (srcBLen - 1U);
00564     py = pSrc2;
00565 
00566     while (blockSize3 > 0)
00567     {
00568       /* Accumulator is made zero for every iteration */
00569       sum = 0.0f;
00570 
00571       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00572       k = count >> 2U;
00573 
00574       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
00575        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00576       while (k > 0U)
00577       {
00578         /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
00579         sum += *px++ * *py--;
00580 
00581         /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
00582         sum += *px++ * *py--;
00583 
00584         /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
00585         sum += *px++ * *py--;
00586 
00587         /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
00588         sum += *px++ * *py--;
00589 
00590         /* Decrement the loop counter */
00591         k--;
00592       }
00593 
00594       /* If the count is not a multiple of 4, compute any remaining MACs here.
00595        ** No loop unrolling is used. */
00596       k = count % 0x4U;
00597 
00598       while (k > 0U)
00599       {
00600         /* Perform the multiply-accumulates */
00601         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
00602         sum += *px++ * *py--;
00603 
00604         /* Decrement the loop counter */
00605         k--;
00606       }
00607 
00608       /* Store the result in the accumulator in the destination buffer. */
00609       *pOut++ = sum;
00610 
00611       /* Update the inputA and inputB pointers for next MAC calculation */
00612       px = ++pSrc1;
00613       py = pSrc2;
00614 
00615       /* Decrement the MAC count */
00616       count--;
00617 
00618       /* Decrement the loop counter */
00619       blockSize3--;
00620 
00621     }
00622 
00623     /* set status as ARM_MATH_SUCCESS */
00624     status = ARM_MATH_SUCCESS;
00625   }
00626 
00627   /* Return to application */
00628   return (status);
00629 
00630 #else
00631 
00632   /* Run the below code for Cortex-M0 */
00633 
00634   float32_t *pIn1 = pSrcA;                       /* inputA pointer */
00635   float32_t *pIn2 = pSrcB;                       /* inputB pointer */
00636   float32_t sum;                                 /* Accumulator */
00637   uint32_t i, j;                                 /* loop counters */
00638   arm_status status;                             /* status of Partial convolution */
00639 
00640   /* Check for range of output samples to be calculated */
00641   if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
00642   {
00643     /* Set status as ARM_ARGUMENT_ERROR */
00644     status = ARM_MATH_ARGUMENT_ERROR;
00645   }
00646   else
00647   {
00648     /* Loop to calculate convolution for output length number of values */
00649     for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
00650     {
00651       /* Initialize sum with zero to carry on MAC operations */
00652       sum = 0.0f;
00653 
00654       /* Loop to perform MAC operations according to convolution equation */
00655       for (j = 0U; j <= i; j++)
00656       {
00657         /* Check the array limitations for inputs */
00658         if ((((i - j) < srcBLen) && (j < srcALen)))
00659         {
00660           /* z[i] += x[i-j] * y[j] */
00661           sum += pIn1[j] * pIn2[i - j];
00662         }
00663       }
00664       /* Store the output in the destination buffer */
00665       pDst[i] = sum;
00666     }
00667     /* set status as ARM_SUCCESS as there are no argument errors */
00668     status = ARM_MATH_SUCCESS;
00669   }
00670   return (status);
00671 
00672 #endif /*   #if defined (ARM_MATH_DSP) */
00673 
00674 }
00675 
00676 /**
00677  * @} end of PartialConv group
00678  */
00679