CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_f32.c Source File

arm_conv_f32.c

00001 /* ----------------------------------------------------------------------------  
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        29. November 2010  
00005 * $Revision:    V1.0.3  
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_conv_f32.c  
00009 *  
00010 * Description:  Convolution of floating-point sequences.  
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.3 2010/11/29 
00015 *    Re-organized the CMSIS folders and updated documentation.  
00016 *   
00017 * Version 1.0.2 2010/11/11  
00018 *    Documentation updated.   
00019 *  
00020 * Version 1.0.1 2010/10/05   
00021 *    Production release and review comments incorporated.  
00022 *  
00023 * Version 1.0.0 2010/09/20   
00024 *    Production release and review comments incorporated  
00025 *  
00026 * Version 0.0.7  2010/06/10   
00027 *    Misra-C changes done  
00028 *  
00029 * -------------------------------------------------------------------------- */ 
00030  
00031 #include "arm_math.h" 
00032  
00033 /**  
00034  * @ingroup groupFilters  
00035  */ 
00036  
00037 /**  
00038  * @defgroup Conv Convolution  
00039  *  
00040  * Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector.  
00041  * Convolution is similar to correlation and is frequently used in filtering and data analysis.  
00042  * The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types.  
00043  * The library also provides fast versions of the Q15 and Q31 functions.  
00044  *  
00045  * \par Algorithm  
00046  * Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.  
00047  * Then the convolution  
00048  *  
00049  * <pre>  
00050  *                   c[n] = a[n] * b[n]  
00051  * </pre>  
00052  *  
00053  * \par  
00054  * is defined as  
00055  * \image html ConvolutionEquation.gif  
00056  * \par  
00057  * Note that <code>c[n]</code> is of length <code>srcALen + srcBLen - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., srcALen + srcBLen - 2</code>.  
00058  * <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and  
00059  * <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.  
00060  * The output result is written to <code>pDst</code> and the calling function must allocate <code>srcALen+srcBLen-1</code> words for the result.  
00061  *  
00062  * \par  
00063  * Conceptually, when two signals <code>a[n]</code> and <code>b[n]</code> are convolved,  
00064  * the signal <code>b[n]</code> slides over <code>a[n]</code>.  
00065  * For each offset \c n, the overlapping portions of a[n] and b[n] are multiplied and summed together.  
00066  *  
00067  * \par  
00068  * Note that convolution is a commutative operation:  
00069  *  
00070  * <pre>  
00071  *                   a[n] * b[n] = b[n] * a[n].  
00072  * </pre>  
00073  *  
00074  * \par  
00075  * This means that switching the A and B arguments to the convolution functions has no effect.  
00076  *  
00077  * <b>Fixed-Point Behavior</b>  
00078  *  
00079  * \par  
00080  * Convolution requires summing up a large number of intermediate products.  
00081  * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.  
00082  * Refer to the function specific documentation below for further details of the particular algorithm used.  
00083  */ 
00084  
00085 /**  
00086  * @addtogroup Conv  
00087  * @{  
00088  */ 
00089  
00090 /**  
00091  * @brief Convolution of floating-point sequences.  
00092  * @param[in] *pSrcA points to the first input sequence.  
00093  * @param[in] srcALen length of the first input sequence.  
00094  * @param[in] *pSrcB points to the second input sequence.  
00095  * @param[in] srcBLen length of the second input sequence.  
00096  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.  
00097  * @return none.  
00098  */ 
00099  
00100 void arm_conv_f32( 
00101   float32_t * pSrcA, 
00102   uint32_t srcALen, 
00103   float32_t * pSrcB, 
00104   uint32_t srcBLen, 
00105   float32_t * pDst) 
00106 { 
00107   float32_t *pIn1;                               /* inputA pointer */ 
00108   float32_t *pIn2;                               /* inputB pointer */ 
00109   float32_t *pOut = pDst;                        /* output pointer */ 
00110   float32_t *px;                                 /* Intermediate inputA pointer */ 
00111   float32_t *py;                                 /* Intermediate inputB pointer */ 
00112   float32_t *pSrc1, *pSrc2;                      /* Intermediate pointers */ 
00113   float32_t sum, acc0, acc1, acc2, acc3;         /* Accumulator */ 
00114   float32_t x0, x1, x2, x3, c0;                  /* Temporary variables to hold state and coefficient values */ 
00115   uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3;     /* loop counters */ 
00116  
00117  
00118   /* The algorithm implementation is based on the lengths of the inputs. */ 
00119   /* srcB is always made to slide across srcA. */ 
00120   /* So srcBLen is always considered as shorter or equal to srcALen */ 
00121   if(srcALen >= srcBLen) 
00122   { 
00123     /* Initialization of inputA pointer */ 
00124     pIn1 = pSrcA; 
00125  
00126     /* Initialization of inputB pointer */ 
00127     pIn2 = pSrcB; 
00128   } 
00129   else 
00130   { 
00131     /* Initialization of inputA pointer */ 
00132     pIn1 = pSrcB; 
00133  
00134     /* Initialization of inputB pointer */ 
00135     pIn2 = pSrcA; 
00136  
00137     /* srcBLen is always considered as shorter or equal to srcALen */ 
00138     j = srcBLen; 
00139     srcBLen = srcALen; 
00140     srcALen = j; 
00141   } 
00142  
00143   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 
00144   /* The function is internally  
00145    * divided into three stages according to the number of multiplications that has to be  
00146    * taken place between inputA samples and inputB samples. In the first stage of the  
00147    * algorithm, the multiplications increase by one for every iteration.  
00148    * In the second stage of the algorithm, srcBLen number of multiplications are done.  
00149    * In the third stage of the algorithm, the multiplications decrease by one  
00150    * for every iteration. */ 
00151  
00152   /* The algorithm is implemented in three stages.  
00153      The loop counters of each stage is initiated here. */ 
00154   blockSize1 = srcBLen - 1u; 
00155   blockSize2 = srcALen - (srcBLen - 1u); 
00156   blockSize3 = blockSize1; 
00157  
00158   /* --------------------------  
00159    * initializations of stage1  
00160    * -------------------------*/ 
00161  
00162   /* sum = x[0] * y[0]  
00163    * sum = x[0] * y[1] + x[1] * y[0]  
00164    * ....  
00165    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]  
00166    */ 
00167  
00168   /* In this stage the MAC operations are increased by 1 for every iteration.  
00169      The count variable holds the number of MAC operations performed */ 
00170   count = 1u; 
00171  
00172   /* Working pointer of inputA */ 
00173   px = pIn1; 
00174  
00175   /* Working pointer of inputB */ 
00176   py = pIn2; 
00177  
00178  
00179   /* ------------------------  
00180    * Stage1 process  
00181    * ----------------------*/ 
00182  
00183   /* The first stage starts here */ 
00184   while(blockSize1 > 0u) 
00185   { 
00186     /* Accumulator is made zero for every iteration */ 
00187     sum = 0.0f; 
00188  
00189     /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00190     k = count >> 2u; 
00191  
00192     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00193      ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00194     while(k > 0u) 
00195     { 
00196       /* x[0] * y[srcBLen - 1] */ 
00197       sum += *px++ * *py--; 
00198  
00199       /* x[1] * y[srcBLen - 2] */ 
00200       sum += *px++ * *py--; 
00201  
00202       /* x[2] * y[srcBLen - 3] */ 
00203       sum += *px++ * *py--; 
00204  
00205       /* x[3] * y[srcBLen - 4] */ 
00206       sum += *px++ * *py--; 
00207  
00208       /* Decrement the loop counter */ 
00209       k--; 
00210     } 
00211  
00212     /* If the count is not a multiple of 4, compute any remaining MACs here.  
00213      ** No loop unrolling is used. */ 
00214     k = count % 0x4u; 
00215  
00216     while(k > 0u) 
00217     { 
00218       /* Perform the multiply-accumulate */ 
00219       sum += *px++ * *py--; 
00220  
00221       /* Decrement the loop counter */ 
00222       k--; 
00223     } 
00224  
00225     /* Store the result in the accumulator in the destination buffer. */ 
00226     *pOut++ = sum; 
00227  
00228     /* Update the inputA and inputB pointers for next MAC calculation */ 
00229     py = pIn2 + count; 
00230     px = pIn1; 
00231  
00232     /* Increment the MAC count */ 
00233     count++; 
00234  
00235     /* Decrement the loop counter */ 
00236     blockSize1--; 
00237   } 
00238  
00239   /* --------------------------  
00240    * Initializations of stage2  
00241    * ------------------------*/ 
00242  
00243   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]  
00244    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]  
00245    * ....  
00246    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]  
00247    */ 
00248  
00249   /* Working pointer of inputA */ 
00250   px = pIn1; 
00251  
00252   /* Working pointer of inputB */ 
00253   pSrc2 = pIn2 + (srcBLen - 1u); 
00254   py = pSrc2; 
00255  
00256   /* count is index by which the pointer pIn1 to be incremented */ 
00257   count = 1u; 
00258  
00259   /* -------------------  
00260    * Stage2 process  
00261    * ------------------*/ 
00262  
00263   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.  
00264    * So, to loop unroll over blockSize2,  
00265    * srcBLen should be greater than or equal to 4 */ 
00266   if(srcBLen >= 4u) 
00267   { 
00268     /* Loop unroll over blockSize2, by 4 */ 
00269     blkCnt = blockSize2 >> 2u; 
00270  
00271     while(blkCnt > 0u) 
00272     { 
00273       /* Set all accumulators to zero */ 
00274       acc0 = 0.0f; 
00275       acc1 = 0.0f; 
00276       acc2 = 0.0f; 
00277       acc3 = 0.0f; 
00278  
00279       /* read x[0], x[1], x[2] samples */ 
00280       x0 = *(px++); 
00281       x1 = *(px++); 
00282       x2 = *(px++); 
00283  
00284       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00285       k = srcBLen >> 2u; 
00286  
00287       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00288        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00289       do 
00290       { 
00291         /* Read y[srcBLen - 1] sample */ 
00292         c0 = *(py--); 
00293  
00294         /* Read x[3] sample */ 
00295         x3 = *(px++); 
00296  
00297         /* Perform the multiply-accumulate */ 
00298         /* acc0 +=  x[0] * y[srcBLen - 1] */ 
00299         acc0 += x0 * c0; 
00300  
00301         /* acc1 +=  x[1] * y[srcBLen - 1] */ 
00302         acc1 += x1 * c0; 
00303  
00304         /* acc2 +=  x[2] * y[srcBLen - 1] */ 
00305         acc2 += x2 * c0; 
00306  
00307         /* acc3 +=  x[3] * y[srcBLen - 1] */ 
00308         acc3 += x3 * c0; 
00309  
00310         /* Read y[srcBLen - 2] sample */ 
00311         c0 = *(py--); 
00312  
00313         /* Read x[4] sample */ 
00314         x0 = *(px++); 
00315  
00316         /* Perform the multiply-accumulate */ 
00317         /* acc0 +=  x[1] * y[srcBLen - 2] */ 
00318         acc0 += x1 * c0; 
00319         /* acc1 +=  x[2] * y[srcBLen - 2] */ 
00320         acc1 += x2 * c0; 
00321         /* acc2 +=  x[3] * y[srcBLen - 2] */ 
00322         acc2 += x3 * c0; 
00323         /* acc3 +=  x[4] * y[srcBLen - 2] */ 
00324         acc3 += x0 * c0; 
00325  
00326         /* Read y[srcBLen - 3] sample */ 
00327         c0 = *(py--); 
00328  
00329         /* Read x[5] sample */ 
00330         x1 = *(px++); 
00331  
00332         /* Perform the multiply-accumulates */ 
00333         /* acc0 +=  x[2] * y[srcBLen - 3] */ 
00334         acc0 += x2 * c0; 
00335         /* acc1 +=  x[3] * y[srcBLen - 2] */ 
00336         acc1 += x3 * c0; 
00337         /* acc2 +=  x[4] * y[srcBLen - 2] */ 
00338         acc2 += x0 * c0; 
00339         /* acc3 +=  x[5] * y[srcBLen - 2] */ 
00340         acc3 += x1 * c0; 
00341  
00342         /* Read y[srcBLen - 4] sample */ 
00343         c0 = *(py--); 
00344  
00345         /* Read x[6] sample */ 
00346         x2 = *(px++); 
00347  
00348         /* Perform the multiply-accumulates */ 
00349         /* acc0 +=  x[3] * y[srcBLen - 4] */ 
00350         acc0 += x3 * c0; 
00351         /* acc1 +=  x[4] * y[srcBLen - 4] */ 
00352         acc1 += x0 * c0; 
00353         /* acc2 +=  x[5] * y[srcBLen - 4] */ 
00354         acc2 += x1 * c0; 
00355         /* acc3 +=  x[6] * y[srcBLen - 4] */ 
00356         acc3 += x2 * c0; 
00357  
00358  
00359       } while(--k); 
00360  
00361       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00362        ** No loop unrolling is used. */ 
00363       k = srcBLen % 0x4u; 
00364  
00365       while(k > 0u) 
00366       { 
00367         /* Read y[srcBLen - 5] sample */ 
00368         c0 = *(py--); 
00369  
00370         /* Read x[7] sample */ 
00371         x3 = *(px++); 
00372  
00373         /* Perform the multiply-accumulates */ 
00374         /* acc0 +=  x[4] * y[srcBLen - 5] */ 
00375         acc0 += x0 * c0; 
00376         /* acc1 +=  x[5] * y[srcBLen - 5] */ 
00377         acc1 += x1 * c0; 
00378         /* acc2 +=  x[6] * y[srcBLen - 5] */ 
00379         acc2 += x2 * c0; 
00380         /* acc3 +=  x[7] * y[srcBLen - 5] */ 
00381         acc3 += x3 * c0; 
00382  
00383         /* Reuse the present samples for the next MAC */ 
00384         x0 = x1; 
00385         x1 = x2; 
00386         x2 = x3; 
00387  
00388         /* Decrement the loop counter */ 
00389         k--; 
00390       } 
00391  
00392       /* Store the result in the accumulator in the destination buffer. */ 
00393       *pOut++ = acc0; 
00394       *pOut++ = acc1; 
00395       *pOut++ = acc2; 
00396       *pOut++ = acc3; 
00397  
00398       /* Update the inputA and inputB pointers for next MAC calculation */ 
00399       px = pIn1 + (count * 4u); 
00400       py = pSrc2; 
00401  
00402       /* Increment the pointer pIn1 index, count by 1 */ 
00403       count++; 
00404  
00405       /* Decrement the loop counter */ 
00406       blkCnt--; 
00407     } 
00408  
00409     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.  
00410      ** No loop unrolling is used. */ 
00411     blkCnt = blockSize2 % 0x4u; 
00412  
00413     while(blkCnt > 0u) 
00414     { 
00415       /* Accumulator is made zero for every iteration */ 
00416       sum = 0.0f; 
00417  
00418       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00419       k = srcBLen >> 2u; 
00420  
00421       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00422        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00423       while(k > 0u) 
00424       { 
00425         /* Perform the multiply-accumulates */ 
00426         sum += *px++ * *py--; 
00427         sum += *px++ * *py--; 
00428         sum += *px++ * *py--; 
00429         sum += *px++ * *py--; 
00430  
00431         /* Decrement the loop counter */ 
00432         k--; 
00433       } 
00434  
00435       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00436        ** No loop unrolling is used. */ 
00437       k = srcBLen % 0x4u; 
00438  
00439       while(k > 0u) 
00440       { 
00441         /* Perform the multiply-accumulate */ 
00442         sum += *px++ * *py--; 
00443  
00444         /* Decrement the loop counter */ 
00445         k--; 
00446       } 
00447  
00448       /* Store the result in the accumulator in the destination buffer. */ 
00449       *pOut++ = sum; 
00450  
00451       /* Update the inputA and inputB pointers for next MAC calculation */ 
00452       px = pIn1 + count; 
00453       py = pSrc2; 
00454  
00455       /* Increment the MAC count */ 
00456       count++; 
00457  
00458       /* Decrement the loop counter */ 
00459       blkCnt--; 
00460     } 
00461   } 
00462   else 
00463   { 
00464     /* If the srcBLen is not a multiple of 4,  
00465      * the blockSize2 loop cannot be unrolled by 4 */ 
00466     blkCnt = blockSize2; 
00467  
00468     while(blkCnt > 0u) 
00469     { 
00470       /* Accumulator is made zero for every iteration */ 
00471       sum = 0.0f; 
00472  
00473       /* srcBLen number of MACS should be performed */ 
00474       k = srcBLen; 
00475  
00476       while(k > 0u) 
00477       { 
00478         /* Perform the multiply-accumulate */ 
00479         sum += *px++ * *py--; 
00480  
00481         /* Decrement the loop counter */ 
00482         k--; 
00483       } 
00484  
00485       /* Store the result in the accumulator in the destination buffer. */ 
00486       *pOut++ = sum; 
00487  
00488       /* Update the inputA and inputB pointers for next MAC calculation */ 
00489       px = pIn1 + count; 
00490       py = pSrc2; 
00491  
00492       /* Increment the MAC count */ 
00493       count++; 
00494  
00495       /* Decrement the loop counter */ 
00496       blkCnt--; 
00497     } 
00498   } 
00499  
00500  
00501   /* --------------------------  
00502    * Initializations of stage3  
00503    * -------------------------*/ 
00504  
00505   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]  
00506    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]  
00507    * ....  
00508    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]  
00509    * sum +=  x[srcALen-1] * y[srcBLen-1]  
00510    */ 
00511  
00512   /* In this stage the MAC operations are decreased by 1 for every iteration.  
00513      The blockSize3 variable holds the number of MAC operations performed */ 
00514  
00515   /* Working pointer of inputA */ 
00516   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 
00517   px = pSrc1; 
00518  
00519   /* Working pointer of inputB */ 
00520   pSrc2 = pIn2 + (srcBLen - 1u); 
00521   py = pSrc2; 
00522  
00523   /* -------------------  
00524    * Stage3 process  
00525    * ------------------*/ 
00526  
00527   while(blockSize3 > 0u) 
00528   { 
00529     /* Accumulator is made zero for every iteration */ 
00530     sum = 0.0f; 
00531  
00532     /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00533     k = blockSize3 >> 2u; 
00534  
00535     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00536      ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00537     while(k > 0u) 
00538     { 
00539       /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 
00540       sum += *px++ * *py--; 
00541  
00542       /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 
00543       sum += *px++ * *py--; 
00544  
00545       /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 
00546       sum += *px++ * *py--; 
00547  
00548       /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 
00549       sum += *px++ * *py--; 
00550  
00551       /* Decrement the loop counter */ 
00552       k--; 
00553     } 
00554  
00555     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.  
00556      ** No loop unrolling is used. */ 
00557     k = blockSize3 % 0x4u; 
00558  
00559     while(k > 0u) 
00560     { 
00561       /* Perform the multiply-accumulates */ 
00562       /* sum +=  x[srcALen-1] * y[srcBLen-1] */ 
00563       sum += *px++ * *py--; 
00564  
00565       /* Decrement the loop counter */ 
00566       k--; 
00567     } 
00568  
00569     /* Store the result in the accumulator in the destination buffer. */ 
00570     *pOut++ = sum; 
00571  
00572     /* Update the inputA and inputB pointers for next MAC calculation */ 
00573     px = ++pSrc1; 
00574     py = pSrc2; 
00575  
00576     /* Decrement the loop counter */ 
00577     blockSize3--; 
00578   } 
00579  
00580 } 
00581  
00582 /**  
00583  * @} end of Conv group  
00584  */