CMSIS DSP library

Dependents:   performance_timer Surfboard_ gps2rtty Capstone ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_f32.c Source File

arm_conv_partial_f32.c

00001 /* ----------------------------------------------------------------------------    
00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        19. March 2015
00005 * $Revision:    V.1.4.5
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_conv_partial_f32.c    
00009 *    
00010 * Description:  Partial convolution of floating-point sequences.    
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.   
00039 * -------------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**    
00044  * @ingroup groupFilters    
00045  */
00046 
00047 /**    
00048  * @defgroup PartialConv Partial Convolution    
00049  *    
00050  * Partial Convolution is equivalent to Convolution except that a subset of the output samples is generated.    
00051  * Each function has two additional arguments.    
00052  * <code>firstIndex</code> specifies the starting index of the subset of output samples.    
00053  * <code>numPoints</code> is the number of output samples to compute.    
00054  * The function computes the output in the range    
00055  * <code>[firstIndex, ..., firstIndex+numPoints-1]</code>.    
00056  * The output array <code>pDst</code> contains <code>numPoints</code> values.    
00057  *    
00058  * The allowable range of output indices is [0 srcALen+srcBLen-2].    
00059  * If the requested subset does not fall in this range then the functions return ARM_MATH_ARGUMENT_ERROR.    
00060  * Otherwise the functions return ARM_MATH_SUCCESS.    
00061  * \note Refer arm_conv_f32() for details on fixed point behavior.   
00062  *
00063  * 
00064  * <b>Fast Versions</b>
00065  *
00066  * \par 
00067  * Fast versions of partial convolution are provided for Q31 and Q15.  They take fewer cycles than the standard Q31 and Q15 versions, but
00068  * require the input signals to be scaled down to avoid intermediate overflows.   
00069  *
00070  *
00071  * <b>Opt Versions</b>
00072  *
00073  * \par 
00074  * Opt versions are provided for Q15 and Q7.  They use an internal scratch buffer to allow better optimisation.
00075  * These versions take fewer cycles but consume more memory (scratch memory) than the standard Q15 and Q7 versions of partial convolution.
00076  */
00077 
00078 /**    
00079  * @addtogroup PartialConv    
00080  * @{    
00081  */
00082 
00083 /**    
00084  * @brief Partial convolution of floating-point sequences.    
00085  * @param[in]       *pSrcA points to the first input sequence.    
00086  * @param[in]       srcALen length of the first input sequence.    
00087  * @param[in]       *pSrcB points to the second input sequence.    
00088  * @param[in]       srcBLen length of the second input sequence.    
00089  * @param[out]      *pDst points to the output buffer; output sample n is stored at pDst[n], so the buffer must hold at least firstIndex+numPoints elements.    
00090  * @param[in]       firstIndex is the first output sample to start with.    
00091  * @param[in]       numPoints is the number of output points to be computed.    
00092  * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].    
00093  */
00094 
00095 arm_status arm_conv_partial_f32(
00096   float32_t * pSrcA,
00097   uint32_t srcALen,
00098   float32_t * pSrcB,
00099   uint32_t srcBLen,
00100   float32_t * pDst,
00101   uint32_t firstIndex,
00102   uint32_t numPoints)
00103 {
00104 
00105 
00106 #ifndef ARM_MATH_CM0_FAMILY
00107 
00108   /* Run the below code for Cortex-M4 and Cortex-M3 */
00109 
00110   float32_t *pIn1 = pSrcA;                       /* inputA pointer */
00111   float32_t *pIn2 = pSrcB;                       /* inputB pointer */
00112   float32_t *pOut = pDst;                        /* output pointer */
00113   float32_t *px;                                 /* Intermediate inputA pointer */
00114   float32_t *py;                                 /* Intermediate inputB pointer */
00115   float32_t *pSrc1, *pSrc2;                      /* Intermediate pointers */
00116   float32_t sum, acc0, acc1, acc2, acc3;         /* Accumulator */
00117   float32_t x0, x1, x2, x3, c0;                  /* Temporary variables to hold state and coefficient values */
00118   uint32_t j, k, count = 0u, blkCnt, check;
00119   int32_t blockSize1, blockSize2, blockSize3;    /* loop counters */
00120   arm_status status;                             /* status of Partial convolution */
00121 
00122 
00123   /* Check for range of output samples to be calculated */
00124   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00125   {
00126     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00127     status = ARM_MATH_ARGUMENT_ERROR;
00128   }
00129   else
00130   {
00131 
00132     /* The algorithm implementation is based on the lengths of the inputs. */
00133     /* srcB is always made to slide across srcA. */
00134     /* So srcBLen is always considered as shorter or equal to srcALen */
00135     if(srcALen >= srcBLen)
00136     {
00137       /* Initialization of inputA pointer */
00138       pIn1 = pSrcA;
00139 
00140       /* Initialization of inputB pointer */
00141       pIn2 = pSrcB;
00142     }
00143     else
00144     {
00145       /* Initialization of inputA pointer */
00146       pIn1 = pSrcB;
00147 
00148       /* Initialization of inputB pointer */
00149       pIn2 = pSrcA;
00150 
00151       /* srcBLen is always considered as shorter or equal to srcALen */
00152       j = srcBLen;
00153       srcBLen = srcALen;
00154       srcALen = j;
00155     }
00156 
00157     /* Conditions to check which loopCounter holds    
00158      * the first and last indices of the output samples to be calculated. */
00159     check = firstIndex + numPoints;
00160     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
00161     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
00162     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
00163     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
00164                                      (int32_t) numPoints) : 0;
00165     blockSize2 = ((int32_t) check - blockSize3) -
00166       (blockSize1 + (int32_t) firstIndex);
00167     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
00168 
00169     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00170     /* The function is internally    
00171      * divided into three stages according to the number of multiplications that has to be    
00172      * taken place between inputA samples and inputB samples. In the first stage of the    
00173      * algorithm, the multiplications increase by one for every iteration.    
00174      * In the second stage of the algorithm, srcBLen number of multiplications are done.    
00175      * In the third stage of the algorithm, the multiplications decrease by one    
00176      * for every iteration. */
00177 
00178     /* Set the output pointer to point to the firstIndex    
00179      * of the output sample to be calculated. */
00180     pOut = pDst + firstIndex;
00181 
00182     /* --------------------------    
00183      * Initializations of stage1    
00184      * -------------------------*/
00185 
00186     /* sum = x[0] * y[0]    
00187      * sum = x[0] * y[1] + x[1] * y[0]    
00188      * ....    
00189      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]    
00190      */
00191 
00192     /* In this stage the MAC operations are increased by 1 for every iteration.    
00193        The count variable holds the number of MAC operations performed.    
00194        Since the partial convolution starts from from firstIndex    
00195        Number of Macs to be performed is firstIndex + 1 */
00196     count = 1u + firstIndex;
00197 
00198     /* Working pointer of inputA */
00199     px = pIn1;
00200 
00201     /* Working pointer of inputB */
00202     pSrc1 = pIn2 + firstIndex;
00203     py = pSrc1;
00204 
00205     /* ------------------------    
00206      * Stage1 process    
00207      * ----------------------*/
00208 
00209     /* The first stage starts here */
00210     while(blockSize1 > 0)
00211     {
00212       /* Accumulator is made zero for every iteration */
00213       sum = 0.0f;
00214 
00215       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00216       k = count >> 2u;
00217 
00218       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00219        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00220       while(k > 0u)
00221       {
00222         /* x[0] * y[srcBLen - 1] */
00223         sum += *px++ * *py--;
00224 
00225         /* x[1] * y[srcBLen - 2] */
00226         sum += *px++ * *py--;
00227 
00228         /* x[2] * y[srcBLen - 3] */
00229         sum += *px++ * *py--;
00230 
00231         /* x[3] * y[srcBLen - 4] */
00232         sum += *px++ * *py--;
00233 
00234         /* Decrement the loop counter */
00235         k--;
00236       }
00237 
00238       /* If the count is not a multiple of 4, compute any remaining MACs here.    
00239        ** No loop unrolling is used. */
00240       k = count % 0x4u;
00241 
00242       while(k > 0u)
00243       {
00244         /* Perform the multiply-accumulates */
00245         sum += *px++ * *py--;
00246 
00247         /* Decrement the loop counter */
00248         k--;
00249       }
00250 
00251       /* Store the result in the accumulator in the destination buffer. */
00252       *pOut++ = sum;
00253 
00254       /* Update the inputA and inputB pointers for next MAC calculation */
00255       py = ++pSrc1;
00256       px = pIn1;
00257 
00258       /* Increment the MAC count */
00259       count++;
00260 
00261       /* Decrement the loop counter */
00262       blockSize1--;
00263     }
00264 
00265     /* --------------------------    
00266      * Initializations of stage2    
00267      * ------------------------*/
00268 
00269     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]    
00270      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]    
00271      * ....    
00272      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]    
00273      */
00274 
00275     /* Working pointer of inputA */
00276     if((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
00277     {
00278       px = pIn1 + firstIndex - srcBLen + 1;
00279     }
00280     else
00281     {
00282       px = pIn1;
00283     }
00284 
00285     /* Working pointer of inputB */
00286     pSrc2 = pIn2 + (srcBLen - 1u);
00287     py = pSrc2;
00288 
00289     /* count is index by which the pointer pIn1 to be incremented */
00290     count = 0u;
00291 
00292     /* -------------------    
00293      * Stage2 process    
00294      * ------------------*/
00295 
00296     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.    
00297      * So, to loop unroll over blockSize2,    
00298      * srcBLen should be greater than or equal to 4 */
00299     if(srcBLen >= 4u)
00300     {
00301       /* Loop unroll over blockSize2, by 4 */
00302       blkCnt = ((uint32_t) blockSize2 >> 2u);
00303 
00304       while(blkCnt > 0u)
00305       {
00306         /* Set all accumulators to zero */
00307         acc0 = 0.0f;
00308         acc1 = 0.0f;
00309         acc2 = 0.0f;
00310         acc3 = 0.0f;
00311 
00312         /* read x[0], x[1], x[2] samples */
00313         x0 = *(px++);
00314         x1 = *(px++);
00315         x2 = *(px++);
00316 
00317         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00318         k = srcBLen >> 2u;
00319 
00320         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00321          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00322         do
00323         {
00324           /* Read y[srcBLen - 1] sample */
00325           c0 = *(py--);
00326 
00327           /* Read x[3] sample */
00328           x3 = *(px++);
00329 
00330           /* Perform the multiply-accumulate */
00331           /* acc0 +=  x[0] * y[srcBLen - 1] */
00332           acc0 += x0 * c0;
00333 
00334           /* acc1 +=  x[1] * y[srcBLen - 1] */
00335           acc1 += x1 * c0;
00336 
00337           /* acc2 +=  x[2] * y[srcBLen - 1] */
00338           acc2 += x2 * c0;
00339 
00340           /* acc3 +=  x[3] * y[srcBLen - 1] */
00341           acc3 += x3 * c0;
00342 
00343           /* Read y[srcBLen - 2] sample */
00344           c0 = *(py--);
00345 
00346           /* Read x[4] sample */
00347           x0 = *(px++);
00348 
00349           /* Perform the multiply-accumulate */
00350           /* acc0 +=  x[1] * y[srcBLen - 2] */
00351           acc0 += x1 * c0;
00352           /* acc1 +=  x[2] * y[srcBLen - 2] */
00353           acc1 += x2 * c0;
00354           /* acc2 +=  x[3] * y[srcBLen - 2] */
00355           acc2 += x3 * c0;
00356           /* acc3 +=  x[4] * y[srcBLen - 2] */
00357           acc3 += x0 * c0;
00358 
00359           /* Read y[srcBLen - 3] sample */
00360           c0 = *(py--);
00361 
00362           /* Read x[5] sample */
00363           x1 = *(px++);
00364 
00365           /* Perform the multiply-accumulates */
00366           /* acc0 +=  x[2] * y[srcBLen - 3] */
00367           acc0 += x2 * c0;
00368           /* acc1 +=  x[3] * y[srcBLen - 2] */
00369           acc1 += x3 * c0;
00370           /* acc2 +=  x[4] * y[srcBLen - 2] */
00371           acc2 += x0 * c0;
00372           /* acc3 +=  x[5] * y[srcBLen - 2] */
00373           acc3 += x1 * c0;
00374 
00375           /* Read y[srcBLen - 4] sample */
00376           c0 = *(py--);
00377 
00378           /* Read x[6] sample */
00379           x2 = *(px++);
00380 
00381           /* Perform the multiply-accumulates */
00382           /* acc0 +=  x[3] * y[srcBLen - 4] */
00383           acc0 += x3 * c0;
00384           /* acc1 +=  x[4] * y[srcBLen - 4] */
00385           acc1 += x0 * c0;
00386           /* acc2 +=  x[5] * y[srcBLen - 4] */
00387           acc2 += x1 * c0;
00388           /* acc3 +=  x[6] * y[srcBLen - 4] */
00389           acc3 += x2 * c0;
00390 
00391 
00392         } while(--k);
00393 
00394         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.    
00395          ** No loop unrolling is used. */
00396         k = srcBLen % 0x4u;
00397 
00398         while(k > 0u)
00399         {
00400           /* Read y[srcBLen - 5] sample */
00401           c0 = *(py--);
00402 
00403           /* Read x[7] sample */
00404           x3 = *(px++);
00405 
00406           /* Perform the multiply-accumulates */
00407           /* acc0 +=  x[4] * y[srcBLen - 5] */
00408           acc0 += x0 * c0;
00409           /* acc1 +=  x[5] * y[srcBLen - 5] */
00410           acc1 += x1 * c0;
00411           /* acc2 +=  x[6] * y[srcBLen - 5] */
00412           acc2 += x2 * c0;
00413           /* acc3 +=  x[7] * y[srcBLen - 5] */
00414           acc3 += x3 * c0;
00415 
00416           /* Reuse the present samples for the next MAC */
00417           x0 = x1;
00418           x1 = x2;
00419           x2 = x3;
00420 
00421           /* Decrement the loop counter */
00422           k--;
00423         }
00424 
00425         /* Store the result in the accumulator in the destination buffer. */
00426         *pOut++ = acc0;
00427         *pOut++ = acc1;
00428         *pOut++ = acc2;
00429         *pOut++ = acc3;
00430 
00431         /* Increment the pointer pIn1 index, count by 1 */
00432         count += 4u;
00433 
00434         /* Update the inputA and inputB pointers for next MAC calculation */
00435         px = pIn1 + count;
00436         py = pSrc2;
00437 
00438         /* Decrement the loop counter */
00439         blkCnt--;
00440       }
00441 
00442       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.    
00443        ** No loop unrolling is used. */
00444       blkCnt = (uint32_t) blockSize2 % 0x4u;
00445 
00446       while(blkCnt > 0u)
00447       {
00448         /* Accumulator is made zero for every iteration */
00449         sum = 0.0f;
00450 
00451         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00452         k = srcBLen >> 2u;
00453 
00454         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00455          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00456         while(k > 0u)
00457         {
00458           /* Perform the multiply-accumulates */
00459           sum += *px++ * *py--;
00460           sum += *px++ * *py--;
00461           sum += *px++ * *py--;
00462           sum += *px++ * *py--;
00463 
00464           /* Decrement the loop counter */
00465           k--;
00466         }
00467 
00468         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.    
00469          ** No loop unrolling is used. */
00470         k = srcBLen % 0x4u;
00471 
00472         while(k > 0u)
00473         {
00474           /* Perform the multiply-accumulate */
00475           sum += *px++ * *py--;
00476 
00477           /* Decrement the loop counter */
00478           k--;
00479         }
00480 
00481         /* Store the result in the accumulator in the destination buffer. */
00482         *pOut++ = sum;
00483 
00484         /* Increment the MAC count */
00485         count++;
00486 
00487         /* Update the inputA and inputB pointers for next MAC calculation */
00488         px = pIn1 + count;
00489         py = pSrc2;
00490 
00491         /* Decrement the loop counter */
00492         blkCnt--;
00493       }
00494     }
00495     else
00496     {
00497       /* If the srcBLen is not a multiple of 4,    
00498        * the blockSize2 loop cannot be unrolled by 4 */
00499       blkCnt = (uint32_t) blockSize2;
00500 
00501       while(blkCnt > 0u)
00502       {
00503         /* Accumulator is made zero for every iteration */
00504         sum = 0.0f;
00505 
00506         /* srcBLen number of MACS should be performed */
00507         k = srcBLen;
00508 
00509         while(k > 0u)
00510         {
00511           /* Perform the multiply-accumulate */
00512           sum += *px++ * *py--;
00513 
00514           /* Decrement the loop counter */
00515           k--;
00516         }
00517 
00518         /* Store the result in the accumulator in the destination buffer. */
00519         *pOut++ = sum;
00520 
00521         /* Increment the MAC count */
00522         count++;
00523 
00524         /* Update the inputA and inputB pointers for next MAC calculation */
00525         px = pIn1 + count;
00526         py = pSrc2;
00527 
00528         /* Decrement the loop counter */
00529         blkCnt--;
00530       }
00531     }
00532 
00533 
00534     /* --------------------------    
00535      * Initializations of stage3    
00536      * -------------------------*/
00537 
00538     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]    
00539      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]    
00540      * ....    
00541      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]    
00542      * sum +=  x[srcALen-1] * y[srcBLen-1]    
00543      */
00544 
00545     /* In this stage the MAC operations are decreased by 1 for every iteration.    
00546        The count variable holds the number of MAC operations performed */
00547     count = srcBLen - 1u;
00548 
00549     /* Working pointer of inputA */
00550     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
00551     px = pSrc1;
00552 
00553     /* Working pointer of inputB */
00554     pSrc2 = pIn2 + (srcBLen - 1u);
00555     py = pSrc2;
00556 
00557     while(blockSize3 > 0)
00558     {
00559       /* Accumulator is made zero for every iteration */
00560       sum = 0.0f;
00561 
00562       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00563       k = count >> 2u;
00564 
00565       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00566        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00567       while(k > 0u)
00568       {
00569         /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
00570         sum += *px++ * *py--;
00571 
00572         /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
00573         sum += *px++ * *py--;
00574 
00575         /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
00576         sum += *px++ * *py--;
00577 
00578         /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
00579         sum += *px++ * *py--;
00580 
00581         /* Decrement the loop counter */
00582         k--;
00583       }
00584 
00585       /* If the count is not a multiple of 4, compute any remaining MACs here.    
00586        ** No loop unrolling is used. */
00587       k = count % 0x4u;
00588 
00589       while(k > 0u)
00590       {
00591         /* Perform the multiply-accumulates */
00592         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
00593         sum += *px++ * *py--;
00594 
00595         /* Decrement the loop counter */
00596         k--;
00597       }
00598 
00599       /* Store the result in the accumulator in the destination buffer. */
00600       *pOut++ = sum;
00601 
00602       /* Update the inputA and inputB pointers for next MAC calculation */
00603       px = ++pSrc1;
00604       py = pSrc2;
00605 
00606       /* Decrement the MAC count */
00607       count--;
00608 
00609       /* Decrement the loop counter */
00610       blockSize3--;
00611 
00612     }
00613 
00614     /* set status as ARM_MATH_SUCCESS */
00615     status = ARM_MATH_SUCCESS;
00616   }
00617 
00618   /* Return to application */
00619   return (status);
00620 
00621 #else
00622 
00623   /* Run the below code for Cortex-M0 */
00624 
00625   float32_t *pIn1 = pSrcA;                       /* inputA pointer */
00626   float32_t *pIn2 = pSrcB;                       /* inputB pointer */
00627   float32_t sum;                                 /* Accumulator */
00628   uint32_t i, j;                                 /* loop counters */
00629   arm_status status;                             /* status of Partial convolution */
00630 
00631   /* Check for range of output samples to be calculated */
00632   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00633   {
00634     /* Set status as ARM_ARGUMENT_ERROR */
00635     status = ARM_MATH_ARGUMENT_ERROR;
00636   }
00637   else
00638   {
00639     /* Loop to calculate convolution for output length number of values */
00640     for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
00641     {
00642       /* Initialize sum with zero to carry on MAC operations */
00643       sum = 0.0f;
00644 
00645       /* Loop to perform MAC operations according to convolution equation */
00646       for (j = 0u; j <= i; j++)
00647       {
00648         /* Check the array limitations for inputs */
00649         if((((i - j) < srcBLen) && (j < srcALen)))
00650         {
00651           /* z[i] += x[i-j] * y[j] */
00652           sum += pIn1[j] * pIn2[i - j];
00653         }
00654       }
00655       /* Store the output in the destination buffer */
00656       pDst[i] = sum;
00657     }
00658     /* set status as ARM_SUCCESS as there are no argument errors */
00659     status = ARM_MATH_SUCCESS;
00660   }
00661   return (status);
00662 
00663 #endif /*   #ifndef ARM_MATH_CM0_FAMILY */
00664 
00665 }
00666 
00667 /**    
00668  * @} end of PartialConv group    
00669  */