CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_f32.c Source File

arm_conv_partial_f32.c

00001 /* ----------------------------------------------------------------------------  
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        29. November 2010  
00005 * $Revision:    V1.0.3  
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_conv_partial_f32.c  
00009 *  
00010 * Description:  Partial Convolution of floating-point sequences  
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.3 2010/11/29 
00015 *    Re-organized the CMSIS folders and updated documentation.  
00016 *   
00017 * Version 1.0.2 2010/11/11  
00018 *    Documentation updated.   
00019 *  
00020 * Version 1.0.1 2010/10/05   
00021 *    Production release and review comments incorporated.  
00022 *  
00023 * Version 1.0.0 2010/09/20   
00024 *    Production release and review comments incorporated  
00025 *  
00026 * Version 0.0.7  2010/06/10   
00027 *    Misra-C changes done  
00028 *  
00029 * -------------------------------------------------------------------------- */ 
00030  
00031 #include "arm_math.h" 
00032  
00033 /**  
00034  * @ingroup groupFilters  
00035  */ 
00036  
00037 /**  
00038  * @defgroup PartialConv Partial Convolution  
00039  *  
00040  * Partial Convolution is equivalent to Convolution except that a subset of the output samples is generated.  
00041  * Each function has two additional arguments.  
00042  * <code>firstIndex</code> specifies the starting index of the subset of output samples.  
00043  * <code>numPoints</code> is the number of output samples to compute.  
00044  * The function computes the output in the range  
00045  * <code>[firstIndex, ..., firstIndex+numPoints-1]</code>.  
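00046  * <code>numPoints</code> values are written to the output array <code>pDst</code>; arm_conv_partial_f32() stores them starting at <code>pDst[firstIndex]</code>, not at <code>pDst[0]</code>.  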
00047  *  
00048  * The allowable range of output indices is [0 srcALen+srcBLen-2].  
00049  * If the requested subset does not fall in this range then the functions return ARM_MATH_ARGUMENT_ERROR.  
00050  * Otherwise the functions return ARM_MATH_SUCCESS.  
00051  * \note Refer to arm_conv_f32() for details on fixed point behavior. 
00052  */ 
00053  
00054 /**  
00055  * @addtogroup PartialConv  
00056  * @{  
00057  */ 
00058  
00059 /**  
00060  * @brief Partial convolution of floating-point sequences.  
00061  * @param[in]       *pSrcA points to the first input sequence.  
00062  * @param[in]       srcALen length of the first input sequence.  
00063  * @param[in]       *pSrcB points to the second input sequence.  
00064  * @param[in]       srcBLen length of the second input sequence.  
00065  * @param[out]      *pDst points to the location where the output result is written.  
00066  * @param[in]       firstIndex is the first output sample to start with.  
00067  * @param[in]       numPoints is the number of output points to be computed.  
00068  * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].  
00069  */ 
00070  
00071 arm_status arm_conv_partial_f32( 
00072   float32_t * pSrcA, 
00073   uint32_t srcALen, 
00074   float32_t * pSrcB, 
00075   uint32_t srcBLen, 
00076   float32_t * pDst, 
00077   uint32_t firstIndex, 
00078   uint32_t numPoints) 
00079 { 
00080   float32_t *pIn1 = pSrcA;                       /* inputA pointer */ 
00081   float32_t *pIn2 = pSrcB;                       /* inputB pointer */ 
00082   float32_t *pOut = pDst;                        /* output pointer */ 
00083   float32_t *px;                                 /* Intermediate inputA pointer */ 
00084   float32_t *py;                                 /* Intermediate inputB pointer */ 
00085   float32_t *pSrc1, *pSrc2;                      /* Intermediate pointers */ 
00086   float32_t sum, acc0, acc1, acc2, acc3;         /* Accumulator */ 
00087   float32_t x0, x1, x2, x3, c0;                  /* Temporary variables to hold state and coefficient values */ 
00088   uint32_t j, k, count = 0u, blkCnt, check; 
00089   int32_t blockSize1, blockSize2, blockSize3;    /* loop counters */ 
00090   arm_status status;                             /* status of Partial convolution */ 
00091  
00092  
00093   /* Check for range of output samples to be calculated */ 
00094   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 
00095   { 
00096     /* Set status as ARM_MATH_ARGUMENT_ERROR */ 
00097     status = ARM_MATH_ARGUMENT_ERROR; 
00098   } 
00099   else 
00100   { 
00101  
00102     /* The algorithm implementation is based on the lengths of the inputs. */ 
00103     /* srcB is always made to slide across srcA. */ 
00104     /* So srcBLen is always considered as shorter or equal to srcALen */ 
00105     if(srcALen >= srcBLen) 
00106     { 
00107       /* Initialization of inputA pointer */ 
00108       pIn1 = pSrcA; 
00109  
00110       /* Initialization of inputB pointer */ 
00111       pIn2 = pSrcB; 
00112     } 
00113     else 
00114     { 
00115       /* Initialization of inputA pointer */ 
00116       pIn1 = pSrcB; 
00117  
00118       /* Initialization of inputB pointer */ 
00119       pIn2 = pSrcA; 
00120  
00121       /* srcBLen is always considered as shorter or equal to srcALen */ 
00122       j = srcBLen; 
00123       srcBLen = srcALen; 
00124       srcALen = j; 
00125     } 
00126  
00127     /* Conditions to check which loopCounter holds  
00128      * the first and last indices of the output samples to be calculated. */ 
00129     check = firstIndex + numPoints; 
00130     blockSize3 = (int32_t) check - (int32_t) srcALen; 
00131     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 
00132     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex; 
00133     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 
00134                                     (int32_t) numPoints) : 0; 
00135     blockSize2 = ((int32_t) check - blockSize3) -  
00136                  (blockSize1 + (int32_t) firstIndex); 
00137     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 
00138  
00139     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 
00140     /* The function is internally  
00141      * divided into three stages according to the number of multiplications that has to be  
00142      * taken place between inputA samples and inputB samples. In the first stage of the  
00143      * algorithm, the multiplications increase by one for every iteration.  
00144      * In the second stage of the algorithm, srcBLen number of multiplications are done.  
00145      * In the third stage of the algorithm, the multiplications decrease by one  
00146      * for every iteration. */ 
00147  
00148     /* Set the output pointer to point to the firstIndex  
00149      * of the output sample to be calculated. */ 
00150     pOut = pDst + firstIndex; 
00151  
00152     /* --------------------------  
00153      * Initializations of stage1  
00154      * -------------------------*/ 
00155  
00156     /* sum = x[0] * y[0]  
00157      * sum = x[0] * y[1] + x[1] * y[0]  
00158      * ....  
00159      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]  
00160      */ 
00161  
00162     /* In this stage the MAC operations are increased by 1 for every iteration.  
00163        The count variable holds the number of MAC operations performed.  
00164        Since the partial convolution starts from from firstIndex  
00165        Number of Macs to be performed is firstIndex + 1 */ 
00166     count = 1u + firstIndex; 
00167  
00168     /* Working pointer of inputA */ 
00169     px = pIn1; 
00170  
00171     /* Working pointer of inputB */ 
00172     pSrc1 = pIn2 + firstIndex; 
00173     py = pSrc1; 
00174  
00175     /* ------------------------  
00176      * Stage1 process  
00177      * ----------------------*/ 
00178  
00179     /* The first stage starts here */ 
00180     while(blockSize1 > 0) 
00181     { 
00182       /* Accumulator is made zero for every iteration */ 
00183       sum = 0.0f; 
00184  
00185       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00186       k = count >> 2u; 
00187  
00188       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00189        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00190       while(k > 0u) 
00191       { 
00192         /* x[0] * y[srcBLen - 1] */ 
00193         sum += *px++ * *py--; 
00194  
00195         /* x[1] * y[srcBLen - 2] */ 
00196         sum += *px++ * *py--; 
00197  
00198         /* x[2] * y[srcBLen - 3] */ 
00199         sum += *px++ * *py--; 
00200  
00201         /* x[3] * y[srcBLen - 4] */ 
00202         sum += *px++ * *py--; 
00203  
00204         /* Decrement the loop counter */ 
00205         k--; 
00206       } 
00207  
00208       /* If the count is not a multiple of 4, compute any remaining MACs here.  
00209        ** No loop unrolling is used. */ 
00210       k = count % 0x4u; 
00211  
00212       while(k > 0u) 
00213       { 
00214         /* Perform the multiply-accumulates */ 
00215         sum += *px++ * *py--; 
00216  
00217         /* Decrement the loop counter */ 
00218         k--; 
00219       } 
00220  
00221       /* Store the result in the accumulator in the destination buffer. */ 
00222       *pOut++ = sum; 
00223  
00224       /* Update the inputA and inputB pointers for next MAC calculation */ 
00225       py = ++pSrc1; 
00226       px = pIn1; 
00227  
00228       /* Increment the MAC count */ 
00229       count++; 
00230  
00231       /* Decrement the loop counter */ 
00232       blockSize1--; 
00233     } 
00234  
00235     /* --------------------------  
00236      * Initializations of stage2  
00237      * ------------------------*/ 
00238  
00239     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]  
00240      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]  
00241      * ....  
00242      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]  
00243      */ 
00244  
00245     /* Working pointer of inputA */ 
00246     px = pIn1; 
00247  
00248     /* Working pointer of inputB */ 
00249     pSrc2 = pIn2 + (srcBLen - 1u); 
00250     py = pSrc2; 
00251  
00252     /* count is index by which the pointer pIn1 to be incremented */ 
00253     count = 1u; 
00254  
00255     /* -------------------  
00256      * Stage2 process  
00257      * ------------------*/ 
00258  
00259     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.  
00260      * So, to loop unroll over blockSize2,  
00261      * srcBLen should be greater than or equal to 4 */ 
00262     if(srcBLen >= 4u) 
00263     { 
00264       /* Loop unroll over blockSize2, by 4 */ 
00265       blkCnt = ((uint32_t) blockSize2 >> 2u); 
00266  
00267       while(blkCnt > 0u) 
00268       { 
00269         /* Set all accumulators to zero */ 
00270         acc0 = 0.0f; 
00271         acc1 = 0.0f; 
00272         acc2 = 0.0f; 
00273         acc3 = 0.0f; 
00274  
00275         /* read x[0], x[1], x[2] samples */ 
00276         x0 = *(px++); 
00277         x1 = *(px++); 
00278         x2 = *(px++); 
00279  
00280         /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00281         k = srcBLen >> 2u; 
00282  
00283         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00284          ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00285         do 
00286         { 
00287           /* Read y[srcBLen - 1] sample */ 
00288           c0 = *(py--); 
00289  
00290           /* Read x[3] sample */ 
00291           x3 = *(px++); 
00292  
00293           /* Perform the multiply-accumulate */ 
00294           /* acc0 +=  x[0] * y[srcBLen - 1] */ 
00295           acc0 += x0 * c0; 
00296  
00297           /* acc1 +=  x[1] * y[srcBLen - 1] */ 
00298           acc1 += x1 * c0; 
00299  
00300           /* acc2 +=  x[2] * y[srcBLen - 1] */ 
00301           acc2 += x2 * c0; 
00302  
00303           /* acc3 +=  x[3] * y[srcBLen - 1] */ 
00304           acc3 += x3 * c0; 
00305  
00306           /* Read y[srcBLen - 2] sample */ 
00307           c0 = *(py--); 
00308  
00309           /* Read x[4] sample */ 
00310           x0 = *(px++); 
00311  
00312           /* Perform the multiply-accumulate */ 
00313           /* acc0 +=  x[1] * y[srcBLen - 2] */ 
00314           acc0 += x1 * c0; 
00315           /* acc1 +=  x[2] * y[srcBLen - 2] */ 
00316           acc1 += x2 * c0; 
00317           /* acc2 +=  x[3] * y[srcBLen - 2] */ 
00318           acc2 += x3 * c0; 
00319           /* acc3 +=  x[4] * y[srcBLen - 2] */ 
00320           acc3 += x0 * c0; 
00321  
00322           /* Read y[srcBLen - 3] sample */ 
00323           c0 = *(py--); 
00324  
00325           /* Read x[5] sample */ 
00326           x1 = *(px++); 
00327  
00328           /* Perform the multiply-accumulates */ 
00329           /* acc0 +=  x[2] * y[srcBLen - 3] */ 
00330           acc0 += x2 * c0; 
00331           /* acc1 +=  x[3] * y[srcBLen - 2] */ 
00332           acc1 += x3 * c0; 
00333           /* acc2 +=  x[4] * y[srcBLen - 2] */ 
00334           acc2 += x0 * c0; 
00335           /* acc3 +=  x[5] * y[srcBLen - 2] */ 
00336           acc3 += x1 * c0; 
00337  
00338           /* Read y[srcBLen - 4] sample */ 
00339           c0 = *(py--); 
00340  
00341           /* Read x[6] sample */ 
00342           x2 = *(px++); 
00343  
00344           /* Perform the multiply-accumulates */ 
00345           /* acc0 +=  x[3] * y[srcBLen - 4] */ 
00346           acc0 += x3 * c0; 
00347           /* acc1 +=  x[4] * y[srcBLen - 4] */ 
00348           acc1 += x0 * c0; 
00349           /* acc2 +=  x[5] * y[srcBLen - 4] */ 
00350           acc2 += x1 * c0; 
00351           /* acc3 +=  x[6] * y[srcBLen - 4] */ 
00352           acc3 += x2 * c0; 
00353  
00354  
00355         } while(--k); 
00356  
00357         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00358          ** No loop unrolling is used. */ 
00359         k = srcBLen % 0x4u; 
00360  
00361         while(k > 0u) 
00362         { 
00363           /* Read y[srcBLen - 5] sample */ 
00364           c0 = *(py--); 
00365  
00366           /* Read x[7] sample */ 
00367           x3 = *(px++); 
00368  
00369           /* Perform the multiply-accumulates */ 
00370           /* acc0 +=  x[4] * y[srcBLen - 5] */ 
00371           acc0 += x0 * c0; 
00372           /* acc1 +=  x[5] * y[srcBLen - 5] */ 
00373           acc1 += x1 * c0; 
00374           /* acc2 +=  x[6] * y[srcBLen - 5] */ 
00375           acc2 += x2 * c0; 
00376           /* acc3 +=  x[7] * y[srcBLen - 5] */ 
00377           acc3 += x3 * c0; 
00378  
00379           /* Reuse the present samples for the next MAC */ 
00380           x0 = x1; 
00381           x1 = x2; 
00382           x2 = x3; 
00383  
00384           /* Decrement the loop counter */ 
00385           k--; 
00386         } 
00387  
00388         /* Store the result in the accumulator in the destination buffer. */ 
00389         *pOut++ = acc0; 
00390         *pOut++ = acc1; 
00391         *pOut++ = acc2; 
00392         *pOut++ = acc3; 
00393  
00394         /* Update the inputA and inputB pointers for next MAC calculation */ 
00395         px = pIn1 + (count * 4u); 
00396         py = pSrc2; 
00397  
00398         /* Increment the pointer pIn1 index, count by 1 */ 
00399         count++; 
00400  
00401         /* Decrement the loop counter */ 
00402         blkCnt--; 
00403       } 
00404  
00405       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.  
00406        ** No loop unrolling is used. */ 
00407       blkCnt = (uint32_t) blockSize2 % 0x4u; 
00408  
00409       while(blkCnt > 0u) 
00410       { 
00411         /* Accumulator is made zero for every iteration */ 
00412         sum = 0.0f; 
00413  
00414         /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00415         k = srcBLen >> 2u; 
00416  
00417         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00418          ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00419         while(k > 0u) 
00420         { 
00421           /* Perform the multiply-accumulates */ 
00422           sum += *px++ * *py--; 
00423           sum += *px++ * *py--; 
00424           sum += *px++ * *py--; 
00425           sum += *px++ * *py--; 
00426  
00427           /* Decrement the loop counter */ 
00428           k--; 
00429         } 
00430  
00431         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00432          ** No loop unrolling is used. */ 
00433         k = srcBLen % 0x4u; 
00434  
00435         while(k > 0u) 
00436         { 
00437           /* Perform the multiply-accumulate */ 
00438           sum += *px++ * *py--; 
00439  
00440           /* Decrement the loop counter */ 
00441           k--; 
00442         } 
00443  
00444         /* Store the result in the accumulator in the destination buffer. */ 
00445         *pOut++ = sum; 
00446  
00447         /* Update the inputA and inputB pointers for next MAC calculation */ 
00448         px = pIn1 + count; 
00449         py = pSrc2; 
00450  
00451         /* Increment the MAC count */ 
00452         count++; 
00453  
00454         /* Decrement the loop counter */ 
00455         blkCnt--; 
00456       } 
00457     } 
00458     else 
00459     { 
00460       /* If the srcBLen is not a multiple of 4,  
00461        * the blockSize2 loop cannot be unrolled by 4 */ 
00462       blkCnt = (uint32_t) blockSize2; 
00463  
00464       while(blkCnt > 0u) 
00465       { 
00466         /* Accumulator is made zero for every iteration */ 
00467         sum = 0.0f; 
00468  
00469         /* srcBLen number of MACS should be performed */ 
00470         k = srcBLen; 
00471  
00472         while(k > 0u) 
00473         { 
00474           /* Perform the multiply-accumulate */ 
00475           sum += *px++ * *py--; 
00476  
00477           /* Decrement the loop counter */ 
00478           k--; 
00479         } 
00480  
00481         /* Store the result in the accumulator in the destination buffer. */ 
00482         *pOut++ = sum; 
00483  
00484         /* Update the inputA and inputB pointers for next MAC calculation */ 
00485         px = pIn1 + count; 
00486         py = pSrc2; 
00487  
00488         /* Increment the MAC count */ 
00489         count++; 
00490  
00491         /* Decrement the loop counter */ 
00492         blkCnt--; 
00493       } 
00494     } 
00495  
00496  
00497     /* --------------------------  
00498      * Initializations of stage3  
00499      * -------------------------*/ 
00500  
00501     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]  
00502      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]  
00503      * ....  
00504      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]  
00505      * sum +=  x[srcALen-1] * y[srcBLen-1]  
00506      */ 
00507  
00508     /* In this stage the MAC operations are decreased by 1 for every iteration.  
00509        The count variable holds the number of MAC operations performed */ 
00510     count = srcBLen - 1u; 
00511  
00512     /* Working pointer of inputA */ 
00513     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 
00514     px = pSrc1; 
00515  
00516     /* Working pointer of inputB */ 
00517     pSrc2 = pIn2 + (srcBLen - 1u); 
00518     py = pSrc2; 
00519  
00520     while(blockSize3 > 0) 
00521     { 
00522       /* Accumulator is made zero for every iteration */ 
00523       sum = 0.0f; 
00524  
00525       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00526       k = count >> 2u; 
00527  
00528       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00529        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00530       while(k > 0u) 
00531       { 
00532         /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 
00533         sum += *px++ * *py--; 
00534  
00535         /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 
00536         sum += *px++ * *py--; 
00537  
00538         /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 
00539         sum += *px++ * *py--; 
00540  
00541         /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 
00542         sum += *px++ * *py--; 
00543  
00544         /* Decrement the loop counter */ 
00545         k--; 
00546       } 
00547  
00548       /* If the count is not a multiple of 4, compute any remaining MACs here.  
00549        ** No loop unrolling is used. */ 
00550       k = count % 0x4u; 
00551  
00552       while(k > 0u) 
00553       { 
00554         /* Perform the multiply-accumulates */ 
00555         /* sum +=  x[srcALen-1] * y[srcBLen-1] */ 
00556         sum += *px++ * *py--; 
00557  
00558         /* Decrement the loop counter */ 
00559         k--; 
00560       } 
00561  
00562       /* Store the result in the accumulator in the destination buffer. */ 
00563       *pOut++ = sum; 
00564  
00565       /* Update the inputA and inputB pointers for next MAC calculation */ 
00566       px = ++pSrc1; 
00567       py = pSrc2; 
00568  
00569       /* Decrement the MAC count */ 
00570       count--; 
00571  
00572       /* Decrement the loop counter */ 
00573       blockSize3--; 
00574  
00575     } 
00576  
00577     /* set status as ARM_MATH_SUCCESS */ 
00578     status = ARM_MATH_SUCCESS; 
00579   } 
00580  
00581   /* Return to application */ 
00582   return (status); 
00583  
00584 } 
00585  
00586 /**  
00587  * @} end of PartialConv group  
00588  */