CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_q31.c Source File

arm_conv_q31.c

00001 /* ----------------------------------------------------------------------  
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        29. November 2010  
00005 * $Revision:    V1.0.3  
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_conv_q31.c  
00009 *  
00010 * Description:  Q31 Convolution.  
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.3 2010/11/29 
00015 *    Re-organized the CMSIS folders and updated documentation.  
00016 *   
00017 * Version 1.0.2 2010/11/11  
00018 *    Documentation updated.   
00019 *  
00020 * Version 1.0.1 2010/10/05   
00021 *    Production release and review comments incorporated.  
00022 *  
00023 * Version 1.0.0 2010/09/20   
00024 *    Production release and review comments incorporated  
00025 *  
00026 * Version 0.0.7  2010/06/10   
00027 *    Misra-C changes done  
00028 *  
00029 * -------------------------------------------------------------------- */ 
00030  
00031 #include "arm_math.h" 
00032  
00033 /**  
00034  * @ingroup groupFilters  
00035  */ 
00036  
00037 /**  
00038  * @addtogroup Conv  
00039  * @{  
00040  */ 
00041  
00042 /**  
00043  * @brief Convolution of Q31 sequences.  
00044  * @param[in] *pSrcA points to the first input sequence.  
00045  * @param[in] srcALen length of the first input sequence.  
00046  * @param[in] *pSrcB points to the second input sequence.  
00047  * @param[in] srcBLen length of the second input sequence.  
00048  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.  
00049  * @return none.  
00050  *  
00051  * @details  
00052  * <b>Scaling and Overflow Behavior:</b>  
00053  *  
00054  * \par  
00055  * The function is implemented using an internal 64-bit accumulator.  
00056  * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.  
00057  * There is no saturation on intermediate additions.  
00058  * Thus, if the accumulator overflows it wraps around and distorts the result.  
00059  * The input signals should be scaled down to avoid intermediate overflows.  
00060  * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,  
00061  * as maximum of min(srcALen, srcBLen) number of additions are carried internally.  
00062  * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.  
00063  *  
00064  * \par  
00065  * See <code>arm_conv_fast_q31()</code> for a faster but less precise implementation of this function.  
00066  */ 
00067  
00068 void arm_conv_q31( 
00069   q31_t * pSrcA, 
00070   uint32_t srcALen, 
00071   q31_t * pSrcB, 
00072   uint32_t srcBLen, 
00073   q31_t * pDst) 
00074 { 
00075   q31_t *pIn1;                                   /* inputA pointer */ 
00076   q31_t *pIn2;                                   /* inputB pointer */ 
00077   q31_t *pOut = pDst;                            /* output pointer */ 
00078   q31_t *px;                                     /* Intermediate inputA pointer  */ 
00079   q31_t *py;                                     /* Intermediate inputB pointer  */ 
00080   q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers */ 
00081   q63_t sum;                                     /* Accumulator */ 
00082   q63_t acc0, acc1, acc2, acc3;                  /* Accumulator */ 
00083   q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold state and coefficient values */ 
00084   uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3;     /* loop counter */ 
00085  
00086  
00087   /* The algorithm implementation is based on the lengths of the inputs. */ 
00088   /* srcB is always made to slide across srcA. */ 
00089   /* So srcBLen is always considered as shorter or equal to srcALen */ 
00090   if(srcALen >= srcBLen) 
00091   { 
00092     /* Initialization of inputA pointer */ 
00093     pIn1 = pSrcA; 
00094  
00095     /* Initialization of inputB pointer */ 
00096     pIn2 = pSrcB; 
00097   } 
00098   else 
00099   { 
00100     /* Initialization of inputA pointer */ 
00101     pIn1 = (q31_t *) pSrcB; 
00102  
00103     /* Initialization of inputB pointer */ 
00104     pIn2 = (q31_t *) pSrcA; 
00105  
00106     /* srcBLen is always considered as shorter or equal to srcALen */ 
00107     j = srcBLen; 
00108     srcBLen = srcALen; 
00109     srcALen = j; 
00110   } 
00111  
00112   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 
00113   /* The function is internally  
00114    * divided into three stages according to the number of multiplications that has to be  
00115    * taken place between inputA samples and inputB samples. In the first stage of the  
00116    * algorithm, the multiplications increase by one for every iteration.  
00117    * In the second stage of the algorithm, srcBLen number of multiplications are done.  
00118    * In the third stage of the algorithm, the multiplications decrease by one  
00119    * for every iteration. */ 
00120  
00121   /* The algorithm is implemented in three stages.  
00122      The loop counters of each stage is initiated here. */ 
00123   blockSize1 = srcBLen - 1u; 
00124   blockSize2 = srcALen - (srcBLen - 1u); 
00125   blockSize3 = blockSize1; 
00126  
00127   /* --------------------------  
00128    * Initializations of stage1  
00129    * -------------------------*/ 
00130  
00131   /* sum = x[0] * y[0]  
00132    * sum = x[0] * y[1] + x[1] * y[0]  
00133    * ....  
00134    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]  
00135    */ 
00136  
00137   /* In this stage the MAC operations are increased by 1 for every iteration.  
00138      The count variable holds the number of MAC operations performed */ 
00139   count = 1u; 
00140  
00141   /* Working pointer of inputA */ 
00142   px = pIn1; 
00143  
00144   /* Working pointer of inputB */ 
00145   py = pIn2; 
00146  
00147  
00148   /* ------------------------  
00149    * Stage1 process  
00150    * ----------------------*/ 
00151  
00152   /* The first stage starts here */ 
00153   while(blockSize1 > 0u) 
00154   { 
00155     /* Accumulator is made zero for every iteration */ 
00156     sum = 0; 
00157  
00158     /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00159     k = count >> 2u; 
00160  
00161     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00162      ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00163     while(k > 0u) 
00164     { 
00165       /* x[0] * y[srcBLen - 1] */ 
00166       sum += (q63_t) * px++ * (*py--); 
00167       /* x[1] * y[srcBLen - 2] */ 
00168       sum += (q63_t) * px++ * (*py--); 
00169       /* x[2] * y[srcBLen - 3] */ 
00170       sum += (q63_t) * px++ * (*py--); 
00171       /* x[3] * y[srcBLen - 4] */ 
00172       sum += (q63_t) * px++ * (*py--); 
00173  
00174       /* Decrement the loop counter */ 
00175       k--; 
00176     } 
00177  
00178     /* If the count is not a multiple of 4, compute any remaining MACs here.  
00179      ** No loop unrolling is used. */ 
00180     k = count % 0x4u; 
00181  
00182     while(k > 0u) 
00183     { 
00184       /* Perform the multiply-accumulate */ 
00185       sum += (q63_t) * px++ * (*py--); 
00186  
00187       /* Decrement the loop counter */ 
00188       k--; 
00189     } 
00190  
00191     /* Store the result in the accumulator in the destination buffer. */ 
00192     *pOut++ = (q31_t) (sum >> 31); 
00193  
00194     /* Update the inputA and inputB pointers for next MAC calculation */ 
00195     py = pIn2 + count; 
00196     px = pIn1; 
00197  
00198     /* Increment the MAC count */ 
00199     count++; 
00200  
00201     /* Decrement the loop counter */ 
00202     blockSize1--; 
00203   } 
00204  
00205   /* --------------------------  
00206    * Initializations of stage2  
00207    * ------------------------*/ 
00208  
00209   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]  
00210    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]  
00211    * ....  
00212    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]  
00213    */ 
00214  
00215   /* Working pointer of inputA */ 
00216   px = pIn1; 
00217  
00218   /* Working pointer of inputB */ 
00219   pSrc2 = pIn2 + (srcBLen - 1u); 
00220   py = pSrc2; 
00221  
00222   /* count is index by which the pointer pIn1 to be incremented */ 
00223   count = 1u; 
00224  
00225   /* -------------------  
00226    * Stage2 process  
00227    * ------------------*/ 
00228  
00229   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.  
00230    * So, to loop unroll over blockSize2,  
00231    * srcBLen should be greater than or equal to 4 */ 
00232   if(srcBLen >= 4u) 
00233   { 
00234     /* Loop unroll over blockSize2, by 4 */ 
00235     blkCnt = blockSize2 >> 2u; 
00236  
00237     while(blkCnt > 0u) 
00238     { 
00239       /* Set all accumulators to zero */ 
00240       acc0 = 0; 
00241       acc1 = 0; 
00242       acc2 = 0; 
00243       acc3 = 0; 
00244  
00245       /* read x[0], x[1], x[2] samples */ 
00246       x0 = *(px++); 
00247       x1 = *(px++); 
00248       x2 = *(px++); 
00249  
00250       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00251       k = srcBLen >> 2u; 
00252  
00253       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00254        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00255       do 
00256       { 
00257         /* Read y[srcBLen - 1] sample */ 
00258         c0 = *(py--); 
00259  
00260         /* Read x[3] sample */ 
00261         x3 = *(px++); 
00262  
00263         /* Perform the multiply-accumulates */ 
00264         /* acc0 +=  x[0] * y[srcBLen - 1] */ 
00265         acc0 += ((q63_t) x0 * c0); 
00266         /* acc1 +=  x[1] * y[srcBLen - 1] */ 
00267         acc1 += ((q63_t) x1 * c0); 
00268         /* acc2 +=  x[2] * y[srcBLen - 1] */ 
00269         acc2 += ((q63_t) x2 * c0); 
00270         /* acc3 +=  x[3] * y[srcBLen - 1] */ 
00271         acc3 += ((q63_t) x3 * c0); 
00272  
00273         /* Read y[srcBLen - 2] sample */ 
00274         c0 = *(py--); 
00275  
00276         /* Read x[4] sample */ 
00277         x0 = *(px++); 
00278  
00279         /* Perform the multiply-accumulate */ 
00280         /* acc0 +=  x[1] * y[srcBLen - 2] */ 
00281         acc0 += ((q63_t) x1 * c0); 
00282         /* acc1 +=  x[2] * y[srcBLen - 2] */ 
00283         acc1 += ((q63_t) x2 * c0); 
00284         /* acc2 +=  x[3] * y[srcBLen - 2] */ 
00285         acc2 += ((q63_t) x3 * c0); 
00286         /* acc3 +=  x[4] * y[srcBLen - 2] */ 
00287         acc3 += ((q63_t) x0 * c0); 
00288  
00289         /* Read y[srcBLen - 3] sample */ 
00290         c0 = *(py--); 
00291  
00292         /* Read x[5] sample */ 
00293         x1 = *(px++); 
00294  
00295         /* Perform the multiply-accumulates */ 
00296         /* acc0 +=  x[2] * y[srcBLen - 3] */ 
00297         acc0 += ((q63_t) x2 * c0); 
00298         /* acc1 +=  x[3] * y[srcBLen - 2] */ 
00299         acc1 += ((q63_t) x3 * c0); 
00300         /* acc2 +=  x[4] * y[srcBLen - 2] */ 
00301         acc2 += ((q63_t) x0 * c0); 
00302         /* acc3 +=  x[5] * y[srcBLen - 2] */ 
00303         acc3 += ((q63_t) x1 * c0); 
00304  
00305         /* Read y[srcBLen - 4] sample */ 
00306         c0 = *(py--); 
00307  
00308         /* Read x[6] sample */ 
00309         x2 = *(px++); 
00310  
00311         /* Perform the multiply-accumulates */ 
00312         /* acc0 +=  x[3] * y[srcBLen - 4] */ 
00313         acc0 += ((q63_t) x3 * c0); 
00314         /* acc1 +=  x[4] * y[srcBLen - 4] */ 
00315         acc1 += ((q63_t) x0 * c0); 
00316         /* acc2 +=  x[5] * y[srcBLen - 4] */ 
00317         acc2 += ((q63_t) x1 * c0); 
00318         /* acc3 +=  x[6] * y[srcBLen - 4] */ 
00319         acc3 += ((q63_t) x2 * c0); 
00320  
00321       } while(--k); 
00322  
00323       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00324        ** No loop unrolling is used. */ 
00325       k = srcBLen % 0x4u; 
00326  
00327       while(k > 0u) 
00328       { 
00329         /* Read y[srcBLen - 5] sample */ 
00330         c0 = *(py--); 
00331  
00332         /* Read x[7] sample */ 
00333         x3 = *(px++); 
00334  
00335         /* Perform the multiply-accumulates */ 
00336         /* acc0 +=  x[4] * y[srcBLen - 5] */ 
00337         acc0 += ((q63_t) x0 * c0); 
00338         /* acc1 +=  x[5] * y[srcBLen - 5] */ 
00339         acc1 += ((q63_t) x1 * c0); 
00340         /* acc2 +=  x[6] * y[srcBLen - 5] */ 
00341         acc2 += ((q63_t) x2 * c0); 
00342         /* acc3 +=  x[7] * y[srcBLen - 5] */ 
00343         acc3 += ((q63_t) x3 * c0); 
00344  
00345         /* Reuse the present samples for the next MAC */ 
00346         x0 = x1; 
00347         x1 = x2; 
00348         x2 = x3; 
00349  
00350         /* Decrement the loop counter */ 
00351         k--; 
00352       } 
00353  
00354       /* Store the results in the accumulators in the destination buffer. */ 
00355       *pOut++ = (q31_t) (acc0 >> 31); 
00356       *pOut++ = (q31_t) (acc1 >> 31); 
00357       *pOut++ = (q31_t) (acc2 >> 31); 
00358       *pOut++ = (q31_t) (acc3 >> 31); 
00359  
00360       /* Update the inputA and inputB pointers for next MAC calculation */ 
00361       px = pIn1 + (count * 4u); 
00362       py = pSrc2; 
00363  
00364       /* Increment the pointer pIn1 index, count by 1 */ 
00365       count++; 
00366  
00367       /* Decrement the loop counter */ 
00368       blkCnt--; 
00369     } 
00370  
00371     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.  
00372      ** No loop unrolling is used. */ 
00373     blkCnt = blockSize2 % 0x4u; 
00374  
00375     while(blkCnt > 0u) 
00376     { 
00377       /* Accumulator is made zero for every iteration */ 
00378       sum = 0; 
00379  
00380       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00381       k = srcBLen >> 2u; 
00382  
00383       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00384        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00385       while(k > 0u) 
00386       { 
00387         /* Perform the multiply-accumulates */ 
00388         sum += (q63_t) * px++ * (*py--); 
00389         sum += (q63_t) * px++ * (*py--); 
00390         sum += (q63_t) * px++ * (*py--); 
00391         sum += (q63_t) * px++ * (*py--); 
00392  
00393         /* Decrement the loop counter */ 
00394         k--; 
00395       } 
00396  
00397       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00398        ** No loop unrolling is used. */ 
00399       k = srcBLen % 0x4u; 
00400  
00401       while(k > 0u) 
00402       { 
00403         /* Perform the multiply-accumulate */ 
00404         sum += (q63_t) * px++ * (*py--); 
00405  
00406         /* Decrement the loop counter */ 
00407         k--; 
00408       } 
00409  
00410       /* Store the result in the accumulator in the destination buffer. */ 
00411       *pOut++ = (q31_t) (sum >> 31); 
00412  
00413       /* Update the inputA and inputB pointers for next MAC calculation */ 
00414       px = pIn1 + count; 
00415       py = pSrc2; 
00416  
00417       /* Increment the MAC count */ 
00418       count++; 
00419  
00420       /* Decrement the loop counter */ 
00421       blkCnt--; 
00422     } 
00423   } 
00424   else 
00425   { 
00426     /* If the srcBLen is not a multiple of 4,  
00427      * the blockSize2 loop cannot be unrolled by 4 */ 
00428     blkCnt = blockSize2; 
00429  
00430     while(blkCnt > 0u) 
00431     { 
00432       /* Accumulator is made zero for every iteration */ 
00433       sum = 0; 
00434  
00435       /* srcBLen number of MACS should be performed */ 
00436       k = srcBLen; 
00437  
00438       while(k > 0u) 
00439       { 
00440         /* Perform the multiply-accumulate */ 
00441         sum += (q63_t) * px++ * (*py--); 
00442  
00443         /* Decrement the loop counter */ 
00444         k--; 
00445       } 
00446  
00447       /* Store the result in the accumulator in the destination buffer. */ 
00448       *pOut++ = (q31_t) (sum >> 31); 
00449  
00450       /* Update the inputA and inputB pointers for next MAC calculation */ 
00451       px = pIn1 + count; 
00452       py = pSrc2; 
00453  
00454       /* Increment the MAC count */ 
00455       count++; 
00456  
00457       /* Decrement the loop counter */ 
00458       blkCnt--; 
00459     } 
00460   } 
00461  
00462  
00463   /* --------------------------  
00464    * Initializations of stage3  
00465    * -------------------------*/ 
00466  
00467   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]  
00468    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]  
00469    * ....  
00470    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]  
00471    * sum +=  x[srcALen-1] * y[srcBLen-1]  
00472    */ 
00473  
00474   /* In this stage the MAC operations are decreased by 1 for every iteration.  
00475      The blockSize3 variable holds the number of MAC operations performed */ 
00476  
00477   /* Working pointer of inputA */ 
00478   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 
00479   px = pSrc1; 
00480  
00481   /* Working pointer of inputB */ 
00482   pSrc2 = pIn2 + (srcBLen - 1u); 
00483   py = pSrc2; 
00484  
00485   /* -------------------  
00486    * Stage3 process  
00487    * ------------------*/ 
00488  
00489   while(blockSize3 > 0u) 
00490   { 
00491     /* Accumulator is made zero for every iteration */ 
00492     sum = 0; 
00493  
00494     /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00495     k = blockSize3 >> 2u; 
00496  
00497     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00498      ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00499     while(k > 0u) 
00500     { 
00501       /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 
00502       sum += (q63_t) * px++ * (*py--); 
00503       /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 
00504       sum += (q63_t) * px++ * (*py--); 
00505       /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 
00506       sum += (q63_t) * px++ * (*py--); 
00507       /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 
00508       sum += (q63_t) * px++ * (*py--); 
00509  
00510       /* Decrement the loop counter */ 
00511       k--; 
00512     } 
00513  
00514     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.  
00515      ** No loop unrolling is used. */ 
00516     k = blockSize3 % 0x4u; 
00517  
00518     while(k > 0u) 
00519     { 
00520       /* Perform the multiply-accumulate */ 
00521       sum += (q63_t) * px++ * (*py--); 
00522  
00523       /* Decrement the loop counter */ 
00524       k--; 
00525     } 
00526  
00527     /* Store the result in the accumulator in the destination buffer. */ 
00528     *pOut++ = (q31_t) (sum >> 31); 
00529  
00530     /* Update the inputA and inputB pointers for next MAC calculation */ 
00531     px = ++pSrc1; 
00532     py = pSrc2; 
00533  
00534     /* Decrement the loop counter */ 
00535     blockSize3--; 
00536   } 
00537  
00538 } 
00539  
00540 /**  
00541  * @} end of Conv group  
00542  */