CMSIS DSP library

Dependents:   KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more

Fork of mbed-dsp by mbed official

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_f32.c Source File

arm_conv_partial_f32.c

00001 /* ----------------------------------------------------------------------------    
00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        17. January 2013
00005 * $Revision:    V1.4.1
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_conv_partial_f32.c    
00009 *    
00010 * Description:  Partial convolution of floating-point sequences.    
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.   
00039 * -------------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**    
00044  * @ingroup groupFilters    
00045  */
00046 
00047 /**    
00048  * @defgroup PartialConv Partial Convolution    
00049  *    
00050  * Partial Convolution is equivalent to Convolution except that a subset of the output samples is generated.    
00051  * Each function has two additional arguments.    
00052  * <code>firstIndex</code> specifies the starting index of the subset of output samples.    
00053  * <code>numPoints</code> is the number of output samples to compute.    
00054  * The function computes the output in the range    
00055  * <code>[firstIndex, ..., firstIndex+numPoints-1]</code>.    
00056  * The <code>numPoints</code> results are stored starting at <code>pDst[firstIndex]</code>, so <code>pDst</code> must have room for <code>firstIndex+numPoints</code> values.    
00057  *    
00058  * The allowable range of output indices is [0 srcALen+srcBLen-2].    
00059  * If the requested subset does not fall in this range then the functions return ARM_MATH_ARGUMENT_ERROR.    
00060  * Otherwise the functions return ARM_MATH_SUCCESS.    
00061  * \note Refer to arm_conv_f32() for details on fixed point behavior.   
00062  *
00063  * 
00064  * <b>Fast Versions</b>
00065  *
00066  * \par 
00067  * Fast versions of partial convolution are provided for Q31 and Q15.  They take fewer cycles than the standard Q31 and Q15 versions,
00068  * but require the input signals to be scaled down to avoid intermediate overflows.   
00069  *
00070  *
00071  * <b>Opt Versions</b>
00072  *
00073  * \par 
00074  * Opt versions of partial convolution are provided for Q15 and Q7.  The design uses a scratch buffer to achieve better optimisation.
00075  * These versions take fewer cycles but consume more memory (scratch memory) than the standard Q15 and Q7 versions of partial convolution.
00076  */
00077 
00078 /**    
00079  * @addtogroup PartialConv    
00080  * @{    
00081  */
00082 
00083 /**    
00084  * @brief Partial convolution of floating-point sequences.    
00085  * @param[in]       *pSrcA points to the first input sequence.    
00086  * @param[in]       srcALen length of the first input sequence.    
00087  * @param[in]       *pSrcB points to the second input sequence.    
00088  * @param[in]       srcBLen length of the second input sequence.    
00089  * @param[out]      *pDst points to the location where the output result is written.    
00090  * @param[in]       firstIndex is the first output sample to start with.    
00091  * @param[in]       numPoints is the number of output points to be computed.    
00092  * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].    
00093  */
00094 
00095 arm_status arm_conv_partial_f32(
00096   float32_t * pSrcA,
00097   uint32_t srcALen,
00098   float32_t * pSrcB,
00099   uint32_t srcBLen,
00100   float32_t * pDst,
00101   uint32_t firstIndex,
00102   uint32_t numPoints)
00103 {
00104 
00105 
00106 #ifndef ARM_MATH_CM0_FAMILY
00107 
00108   /* Run the below code for Cortex-M4 and Cortex-M3 */
00109 
00110   float32_t *pIn1 = pSrcA;                       /* inputA pointer */
00111   float32_t *pIn2 = pSrcB;                       /* inputB pointer */
00112   float32_t *pOut = pDst;                        /* output pointer */
00113   float32_t *px;                                 /* Intermediate inputA pointer */
00114   float32_t *py;                                 /* Intermediate inputB pointer */
00115   float32_t *pSrc1, *pSrc2;                      /* Intermediate pointers */
00116   float32_t sum, acc0, acc1, acc2, acc3;         /* Accumulator */
00117   float32_t x0, x1, x2, x3, c0;                  /* Temporary variables to hold state and coefficient values */
00118   uint32_t j, k, count = 0u, blkCnt, check;
00119   int32_t blockSize1, blockSize2, blockSize3;    /* loop counters */
00120   arm_status status;                             /* status of Partial convolution */
00121 
00122 
00123   /* Check for range of output samples to be calculated */
00124   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00125   {
00126     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00127     status = ARM_MATH_ARGUMENT_ERROR;
00128   }
00129   else
00130   {
00131 
00132     /* The algorithm implementation is based on the lengths of the inputs. */
00133     /* srcB is always made to slide across srcA. */
00134     /* So srcBLen is always considered as shorter or equal to srcALen */
00135     if(srcALen >= srcBLen)
00136     {
00137       /* Initialization of inputA pointer */
00138       pIn1 = pSrcA;
00139 
00140       /* Initialization of inputB pointer */
00141       pIn2 = pSrcB;
00142     }
00143     else
00144     {
00145       /* Initialization of inputA pointer */
00146       pIn1 = pSrcB;
00147 
00148       /* Initialization of inputB pointer */
00149       pIn2 = pSrcA;
00150 
00151       /* srcBLen is always considered as shorter or equal to srcALen */
00152       j = srcBLen;
00153       srcBLen = srcALen;
00154       srcALen = j;
00155     }
00156 
00157     /* Conditions to check which loopCounter holds    
00158      * the first and last indices of the output samples to be calculated. */
00159     check = firstIndex + numPoints;
00160     blockSize3 = (int32_t) check - (int32_t) srcALen;
00161     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
00162     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
00163     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
00164                                      (int32_t) numPoints) : 0;
00165     blockSize2 = ((int32_t) check - blockSize3) -
00166       (blockSize1 + (int32_t) firstIndex);
00167     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
00168 
00169     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00170     /* The function is internally    
00171      * divided into three stages according to the number of multiplications that has to be    
00172      * taken place between inputA samples and inputB samples. In the first stage of the    
00173      * algorithm, the multiplications increase by one for every iteration.    
00174      * In the second stage of the algorithm, srcBLen number of multiplications are done.    
00175      * In the third stage of the algorithm, the multiplications decrease by one    
00176      * for every iteration. */
00177 
00178     /* Set the output pointer to point to the firstIndex    
00179      * of the output sample to be calculated. */
00180     pOut = pDst + firstIndex;
00181 
00182     /* --------------------------    
00183      * Initializations of stage1    
00184      * -------------------------*/
00185 
00186     /* sum = x[0] * y[0]    
00187      * sum = x[0] * y[1] + x[1] * y[0]    
00188      * ....    
00189      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]    
00190      */
00191 
00192     /* In this stage the MAC operations are increased by 1 for every iteration.    
00193        The count variable holds the number of MAC operations performed.    
00194        Since the partial convolution starts from from firstIndex    
00195        Number of Macs to be performed is firstIndex + 1 */
00196     count = 1u + firstIndex;
00197 
00198     /* Working pointer of inputA */
00199     px = pIn1;
00200 
00201     /* Working pointer of inputB */
00202     pSrc1 = pIn2 + firstIndex;
00203     py = pSrc1;
00204 
00205     /* ------------------------    
00206      * Stage1 process    
00207      * ----------------------*/
00208 
00209     /* The first stage starts here */
00210     while(blockSize1 > 0)
00211     {
00212       /* Accumulator is made zero for every iteration */
00213       sum = 0.0f;
00214 
00215       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00216       k = count >> 2u;
00217 
00218       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00219        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00220       while(k > 0u)
00221       {
00222         /* x[0] * y[srcBLen - 1] */
00223         sum += *px++ * *py--;
00224 
00225         /* x[1] * y[srcBLen - 2] */
00226         sum += *px++ * *py--;
00227 
00228         /* x[2] * y[srcBLen - 3] */
00229         sum += *px++ * *py--;
00230 
00231         /* x[3] * y[srcBLen - 4] */
00232         sum += *px++ * *py--;
00233 
00234         /* Decrement the loop counter */
00235         k--;
00236       }
00237 
00238       /* If the count is not a multiple of 4, compute any remaining MACs here.    
00239        ** No loop unrolling is used. */
00240       k = count % 0x4u;
00241 
00242       while(k > 0u)
00243       {
00244         /* Perform the multiply-accumulates */
00245         sum += *px++ * *py--;
00246 
00247         /* Decrement the loop counter */
00248         k--;
00249       }
00250 
00251       /* Store the result in the accumulator in the destination buffer. */
00252       *pOut++ = sum;
00253 
00254       /* Update the inputA and inputB pointers for next MAC calculation */
00255       py = ++pSrc1;
00256       px = pIn1;
00257 
00258       /* Increment the MAC count */
00259       count++;
00260 
00261       /* Decrement the loop counter */
00262       blockSize1--;
00263     }
00264 
00265     /* --------------------------    
00266      * Initializations of stage2    
00267      * ------------------------*/
00268 
00269     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]    
00270      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]    
00271      * ....    
00272      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]    
00273      */
00274 
00275     /* Working pointer of inputA */
00276     px = pIn1;
00277 
00278     /* Working pointer of inputB */
00279     pSrc2 = pIn2 + (srcBLen - 1u);
00280     py = pSrc2;
00281 
00282     /* count is index by which the pointer pIn1 to be incremented */
00283     count = 0u;
00284 
00285     /* -------------------    
00286      * Stage2 process    
00287      * ------------------*/
00288 
00289     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.    
00290      * So, to loop unroll over blockSize2,    
00291      * srcBLen should be greater than or equal to 4 */
00292     if(srcBLen >= 4u)
00293     {
00294       /* Loop unroll over blockSize2, by 4 */
00295       blkCnt = ((uint32_t) blockSize2 >> 2u);
00296 
00297       while(blkCnt > 0u)
00298       {
00299         /* Set all accumulators to zero */
00300         acc0 = 0.0f;
00301         acc1 = 0.0f;
00302         acc2 = 0.0f;
00303         acc3 = 0.0f;
00304 
00305         /* read x[0], x[1], x[2] samples */
00306         x0 = *(px++);
00307         x1 = *(px++);
00308         x2 = *(px++);
00309 
00310         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00311         k = srcBLen >> 2u;
00312 
00313         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00314          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00315         do
00316         {
00317           /* Read y[srcBLen - 1] sample */
00318           c0 = *(py--);
00319 
00320           /* Read x[3] sample */
00321           x3 = *(px++);
00322 
00323           /* Perform the multiply-accumulate */
00324           /* acc0 +=  x[0] * y[srcBLen - 1] */
00325           acc0 += x0 * c0;
00326 
00327           /* acc1 +=  x[1] * y[srcBLen - 1] */
00328           acc1 += x1 * c0;
00329 
00330           /* acc2 +=  x[2] * y[srcBLen - 1] */
00331           acc2 += x2 * c0;
00332 
00333           /* acc3 +=  x[3] * y[srcBLen - 1] */
00334           acc3 += x3 * c0;
00335 
00336           /* Read y[srcBLen - 2] sample */
00337           c0 = *(py--);
00338 
00339           /* Read x[4] sample */
00340           x0 = *(px++);
00341 
00342           /* Perform the multiply-accumulate */
00343           /* acc0 +=  x[1] * y[srcBLen - 2] */
00344           acc0 += x1 * c0;
00345           /* acc1 +=  x[2] * y[srcBLen - 2] */
00346           acc1 += x2 * c0;
00347           /* acc2 +=  x[3] * y[srcBLen - 2] */
00348           acc2 += x3 * c0;
00349           /* acc3 +=  x[4] * y[srcBLen - 2] */
00350           acc3 += x0 * c0;
00351 
00352           /* Read y[srcBLen - 3] sample */
00353           c0 = *(py--);
00354 
00355           /* Read x[5] sample */
00356           x1 = *(px++);
00357 
00358           /* Perform the multiply-accumulates */
00359           /* acc0 +=  x[2] * y[srcBLen - 3] */
00360           acc0 += x2 * c0;
00361           /* acc1 +=  x[3] * y[srcBLen - 2] */
00362           acc1 += x3 * c0;
00363           /* acc2 +=  x[4] * y[srcBLen - 2] */
00364           acc2 += x0 * c0;
00365           /* acc3 +=  x[5] * y[srcBLen - 2] */
00366           acc3 += x1 * c0;
00367 
00368           /* Read y[srcBLen - 4] sample */
00369           c0 = *(py--);
00370 
00371           /* Read x[6] sample */
00372           x2 = *(px++);
00373 
00374           /* Perform the multiply-accumulates */
00375           /* acc0 +=  x[3] * y[srcBLen - 4] */
00376           acc0 += x3 * c0;
00377           /* acc1 +=  x[4] * y[srcBLen - 4] */
00378           acc1 += x0 * c0;
00379           /* acc2 +=  x[5] * y[srcBLen - 4] */
00380           acc2 += x1 * c0;
00381           /* acc3 +=  x[6] * y[srcBLen - 4] */
00382           acc3 += x2 * c0;
00383 
00384 
00385         } while(--k);
00386 
00387         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.    
00388          ** No loop unrolling is used. */
00389         k = srcBLen % 0x4u;
00390 
00391         while(k > 0u)
00392         {
00393           /* Read y[srcBLen - 5] sample */
00394           c0 = *(py--);
00395 
00396           /* Read x[7] sample */
00397           x3 = *(px++);
00398 
00399           /* Perform the multiply-accumulates */
00400           /* acc0 +=  x[4] * y[srcBLen - 5] */
00401           acc0 += x0 * c0;
00402           /* acc1 +=  x[5] * y[srcBLen - 5] */
00403           acc1 += x1 * c0;
00404           /* acc2 +=  x[6] * y[srcBLen - 5] */
00405           acc2 += x2 * c0;
00406           /* acc3 +=  x[7] * y[srcBLen - 5] */
00407           acc3 += x3 * c0;
00408 
00409           /* Reuse the present samples for the next MAC */
00410           x0 = x1;
00411           x1 = x2;
00412           x2 = x3;
00413 
00414           /* Decrement the loop counter */
00415           k--;
00416         }
00417 
00418         /* Store the result in the accumulator in the destination buffer. */
00419         *pOut++ = acc0;
00420         *pOut++ = acc1;
00421         *pOut++ = acc2;
00422         *pOut++ = acc3;
00423 
00424         /* Increment the pointer pIn1 index, count by 1 */
00425         count += 4u;
00426 
00427         /* Update the inputA and inputB pointers for next MAC calculation */
00428         px = pIn1 + count;
00429         py = pSrc2;
00430 
00431         /* Decrement the loop counter */
00432         blkCnt--;
00433       }
00434 
00435       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.    
00436        ** No loop unrolling is used. */
00437       blkCnt = (uint32_t) blockSize2 % 0x4u;
00438 
00439       while(blkCnt > 0u)
00440       {
00441         /* Accumulator is made zero for every iteration */
00442         sum = 0.0f;
00443 
00444         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00445         k = srcBLen >> 2u;
00446 
00447         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00448          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00449         while(k > 0u)
00450         {
00451           /* Perform the multiply-accumulates */
00452           sum += *px++ * *py--;
00453           sum += *px++ * *py--;
00454           sum += *px++ * *py--;
00455           sum += *px++ * *py--;
00456 
00457           /* Decrement the loop counter */
00458           k--;
00459         }
00460 
00461         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.    
00462          ** No loop unrolling is used. */
00463         k = srcBLen % 0x4u;
00464 
00465         while(k > 0u)
00466         {
00467           /* Perform the multiply-accumulate */
00468           sum += *px++ * *py--;
00469 
00470           /* Decrement the loop counter */
00471           k--;
00472         }
00473 
00474         /* Store the result in the accumulator in the destination buffer. */
00475         *pOut++ = sum;
00476 
00477         /* Increment the MAC count */
00478         count++;
00479 
00480         /* Update the inputA and inputB pointers for next MAC calculation */
00481         px = pIn1 + count;
00482         py = pSrc2;
00483 
00484         /* Decrement the loop counter */
00485         blkCnt--;
00486       }
00487     }
00488     else
00489     {
00490       /* If the srcBLen is not a multiple of 4,    
00491        * the blockSize2 loop cannot be unrolled by 4 */
00492       blkCnt = (uint32_t) blockSize2;
00493 
00494       while(blkCnt > 0u)
00495       {
00496         /* Accumulator is made zero for every iteration */
00497         sum = 0.0f;
00498 
00499         /* srcBLen number of MACS should be performed */
00500         k = srcBLen;
00501 
00502         while(k > 0u)
00503         {
00504           /* Perform the multiply-accumulate */
00505           sum += *px++ * *py--;
00506 
00507           /* Decrement the loop counter */
00508           k--;
00509         }
00510 
00511         /* Store the result in the accumulator in the destination buffer. */
00512         *pOut++ = sum;
00513 
00514         /* Increment the MAC count */
00515         count++;
00516 
00517         /* Update the inputA and inputB pointers for next MAC calculation */
00518         px = pIn1 + count;
00519         py = pSrc2;
00520 
00521         /* Decrement the loop counter */
00522         blkCnt--;
00523       }
00524     }
00525 
00526 
00527     /* --------------------------    
00528      * Initializations of stage3    
00529      * -------------------------*/
00530 
00531     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]    
00532      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]    
00533      * ....    
00534      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]    
00535      * sum +=  x[srcALen-1] * y[srcBLen-1]    
00536      */
00537 
00538     /* In this stage the MAC operations are decreased by 1 for every iteration.    
00539        The count variable holds the number of MAC operations performed */
00540     count = srcBLen - 1u;
00541 
00542     /* Working pointer of inputA */
00543     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
00544     px = pSrc1;
00545 
00546     /* Working pointer of inputB */
00547     pSrc2 = pIn2 + (srcBLen - 1u);
00548     py = pSrc2;
00549 
00550     while(blockSize3 > 0)
00551     {
00552       /* Accumulator is made zero for every iteration */
00553       sum = 0.0f;
00554 
00555       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00556       k = count >> 2u;
00557 
00558       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
00559        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00560       while(k > 0u)
00561       {
00562         /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
00563         sum += *px++ * *py--;
00564 
00565         /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
00566         sum += *px++ * *py--;
00567 
00568         /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
00569         sum += *px++ * *py--;
00570 
00571         /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
00572         sum += *px++ * *py--;
00573 
00574         /* Decrement the loop counter */
00575         k--;
00576       }
00577 
00578       /* If the count is not a multiple of 4, compute any remaining MACs here.    
00579        ** No loop unrolling is used. */
00580       k = count % 0x4u;
00581 
00582       while(k > 0u)
00583       {
00584         /* Perform the multiply-accumulates */
00585         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
00586         sum += *px++ * *py--;
00587 
00588         /* Decrement the loop counter */
00589         k--;
00590       }
00591 
00592       /* Store the result in the accumulator in the destination buffer. */
00593       *pOut++ = sum;
00594 
00595       /* Update the inputA and inputB pointers for next MAC calculation */
00596       px = ++pSrc1;
00597       py = pSrc2;
00598 
00599       /* Decrement the MAC count */
00600       count--;
00601 
00602       /* Decrement the loop counter */
00603       blockSize3--;
00604 
00605     }
00606 
00607     /* set status as ARM_MATH_SUCCESS */
00608     status = ARM_MATH_SUCCESS;
00609   }
00610 
00611   /* Return to application */
00612   return (status);
00613 
00614 #else
00615 
00616   /* Run the below code for Cortex-M0 */
00617 
00618   float32_t *pIn1 = pSrcA;                       /* inputA pointer */
00619   float32_t *pIn2 = pSrcB;                       /* inputB pointer */
00620   float32_t sum;                                 /* Accumulator */
00621   uint32_t i, j;                                 /* loop counters */
00622   arm_status status;                             /* status of Partial convolution */
00623 
00624   /* Check for range of output samples to be calculated */
00625   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00626   {
00627     /* Set status as ARM_ARGUMENT_ERROR */
00628     status = ARM_MATH_ARGUMENT_ERROR;
00629   }
00630   else
00631   {
00632     /* Loop to calculate convolution for output length number of values */
00633     for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
00634     {
00635       /* Initialize sum with zero to carry on MAC operations */
00636       sum = 0.0f;
00637 
00638       /* Loop to perform MAC operations according to convolution equation */
00639       for (j = 0u; j <= i; j++)
00640       {
00641         /* Check the array limitations for inputs */
00642         if((((i - j) < srcBLen) && (j < srcALen)))
00643         {
00644           /* z[i] += x[i-j] * y[j] */
00645           sum += pIn1[j] * pIn2[i - j];
00646         }
00647       }
00648       /* Store the output in the destination buffer */
00649       pDst[i] = sum;
00650     }
00651     /* set status as ARM_SUCCESS as there are no argument errors */
00652     status = ARM_MATH_SUCCESS;
00653   }
00654   return (status);
00655 
00656 #endif /*   #ifndef ARM_MATH_CM0_FAMILY */
00657 
00658 }
00659 
00660 /**    
00661  * @} end of PartialConv group    
00662  */