CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_q31.c Source File

arm_conv_partial_q31.c

00001 /* ----------------------------------------------------------------------  
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        29. November 2010  
00005 * $Revision:    V1.0.3  
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_conv_partial_q31.c  
00009 *  
00010 * Description:  Q31 Partial convolution.  
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.3 2010/11/29 
00015 *    Re-organized the CMSIS folders and updated documentation.  
00016 *   
00017 * Version 1.0.2 2010/11/11  
00018 *    Documentation updated.   
00019 *  
00020 * Version 1.0.1 2010/10/05   
00021 *    Production release and review comments incorporated.  
00022 *  
00023 * Version 1.0.0 2010/09/20   
00024 *    Production release and review comments incorporated  
00025 *  
00026 * Version 0.0.7  2010/06/10   
00027 *    Misra-C changes done  
00028 *  
00029 * -------------------------------------------------------------------- */ 
00030  
00031 #include "arm_math.h" 
00032  
00033 /**  
00034  * @ingroup groupFilters  
00035  */ 
00036  
00037 /**  
00038  * @addtogroup PartialConv  
00039  * @{  
00040  */ 
00041  
00042 /**  
00043  * @brief Partial convolution of Q31 sequences.  
00044  * @param[in]       *pSrcA points to the first input sequence.  
00045  * @param[in]       srcALen length of the first input sequence.  
00046  * @param[in]       *pSrcB points to the second input sequence.  
00047  * @param[in]       srcBLen length of the second input sequence.  
00048  * @param[out]      *pDst points to the location where the output result is written.  
00049  * @param[in]       firstIndex is the first output sample to start with.  
00050  * @param[in]       numPoints is the number of output points to be computed.  
00051  * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].  
00052  *  
00053  * See <code>arm_conv_partial_fast_q31()</code> for a faster but less precise implementation of this function.  
00054  */ 
00055  
00056 arm_status arm_conv_partial_q31( 
00057   q31_t * pSrcA, 
00058   uint32_t srcALen, 
00059   q31_t * pSrcB, 
00060   uint32_t srcBLen, 
00061   q31_t * pDst, 
00062   uint32_t firstIndex, 
00063   uint32_t numPoints) 
00064 { 
00065   q31_t *pIn1;                                   /* inputA pointer               */ 
00066   q31_t *pIn2;                                   /* inputB pointer               */ 
00067   q31_t *pOut = pDst;                            /* output pointer               */ 
00068   q31_t *px;                                     /* Intermediate inputA pointer  */ 
00069   q31_t *py;                                     /* Intermediate inputB pointer  */ 
00070   q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */ 
00071   q63_t sum, acc0, acc1, acc2, acc3;             /* Accumulator                  */ 
00072   q31_t x0, x1, x2, x3, c0; 
00073   uint32_t j, k, count, check, blkCnt; 
00074   int32_t blockSize1, blockSize2, blockSize3;    /* loop counter                 */ 
00075   arm_status status;                             /* status of Partial convolution */ 
00076  
00077  
00078   /* Check for range of output samples to be calculated */ 
00079   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 
00080   { 
00081     /* Set status as ARM_MATH_ARGUMENT_ERROR */ 
00082     status = ARM_MATH_ARGUMENT_ERROR; 
00083   } 
00084   else 
00085   { 
00086  
00087     /* The algorithm implementation is based on the lengths of the inputs. */ 
00088     /* srcB is always made to slide across srcA. */ 
00089     /* So srcBLen is always considered as shorter or equal to srcALen */ 
00090     if(srcALen >= srcBLen) 
00091     { 
00092       /* Initialization of inputA pointer */ 
00093       pIn1 = pSrcA; 
00094  
00095       /* Initialization of inputB pointer */ 
00096       pIn2 = pSrcB; 
00097     } 
00098     else 
00099     { 
00100       /* Initialization of inputA pointer */ 
00101       pIn1 = pSrcB; 
00102  
00103       /* Initialization of inputB pointer */ 
00104       pIn2 = pSrcA; 
00105  
00106       /* srcBLen is always considered as shorter or equal to srcALen */ 
00107       j = srcBLen; 
00108       srcBLen = srcALen; 
00109       srcALen = j; 
00110     } 
00111  
00112     /* Conditions to check which loopCounter holds  
00113      * the first and last indices of the output samples to be calculated. */ 
00114     check = firstIndex + numPoints; 
00115     blockSize3 = ((int32_t) check - (int32_t) srcALen); 
00116     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 
00117     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 
00118     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 
00119                                     (int32_t) numPoints) : 0; 
00120     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 
00121                                     (int32_t) firstIndex); 
00122     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 
00123  
00124     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 
00125     /* The function is internally  
00126      * divided into three stages according to the number of multiplications that has to be  
00127      * taken place between inputA samples and inputB samples. In the first stage of the  
00128      * algorithm, the multiplications increase by one for every iteration.  
00129      * In the second stage of the algorithm, srcBLen number of multiplications are done.  
00130      * In the third stage of the algorithm, the multiplications decrease by one  
00131      * for every iteration. */ 
00132  
00133     /* Set the output pointer to point to the firstIndex  
00134      * of the output sample to be calculated. */ 
00135     pOut = pDst + firstIndex; 
00136  
00137     /* --------------------------  
00138      * Initializations of stage1  
00139      * -------------------------*/ 
00140  
00141     /* sum = x[0] * y[0]  
00142      * sum = x[0] * y[1] + x[1] * y[0]  
00143      * ....  
00144      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]  
00145      */ 
00146  
00147     /* In this stage the MAC operations are increased by 1 for every iteration.  
00148        The count variable holds the number of MAC operations performed.  
00149        Since the partial convolution starts from firstIndex  
00150        Number of Macs to be performed is firstIndex + 1 */ 
00151     count = 1u + firstIndex; 
00152  
00153     /* Working pointer of inputA */ 
00154     px = pIn1; 
00155  
00156     /* Working pointer of inputB */ 
00157     pSrc2 = pIn2 + firstIndex; 
00158     py = pSrc2; 
00159  
00160     /* ------------------------  
00161      * Stage1 process  
00162      * ----------------------*/ 
00163  
00164     /* The first loop starts here */ 
00165     while(blockSize1 > 0) 
00166     { 
00167       /* Accumulator is made zero for every iteration */ 
00168       sum = 0; 
00169  
00170       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00171       k = count >> 2u; 
00172  
00173       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00174        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00175       while(k > 0u) 
00176       { 
00177         /* x[0] * y[srcBLen - 1] */ 
00178         sum += (q63_t) * px++ * (*py--); 
00179         /* x[1] * y[srcBLen - 2] */ 
00180         sum += (q63_t) * px++ * (*py--); 
00181         /* x[2] * y[srcBLen - 3] */ 
00182         sum += (q63_t) * px++ * (*py--); 
00183         /* x[3] * y[srcBLen - 4] */ 
00184         sum += (q63_t) * px++ * (*py--); 
00185  
00186         /* Decrement the loop counter */ 
00187         k--; 
00188       } 
00189  
00190       /* If the count is not a multiple of 4, compute any remaining MACs here.  
00191        ** No loop unrolling is used. */ 
00192       k = count % 0x4u; 
00193  
00194       while(k > 0u) 
00195       { 
00196         /* Perform the multiply-accumulate */ 
00197         sum += (q63_t) * px++ * (*py--); 
00198  
00199         /* Decrement the loop counter */ 
00200         k--; 
00201       } 
00202  
00203       /* Store the result in the accumulator in the destination buffer. */ 
00204       *pOut++ = (q31_t) (sum >> 31); 
00205  
00206       /* Update the inputA and inputB pointers for next MAC calculation */ 
00207       py = ++pSrc2; 
00208       px = pIn1; 
00209  
00210       /* Increment the MAC count */ 
00211       count++; 
00212  
00213       /* Decrement the loop counter */ 
00214       blockSize1--; 
00215     } 
00216  
00217     /* --------------------------  
00218      * Initializations of stage2  
00219      * ------------------------*/ 
00220  
00221     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]  
00222      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]  
00223      * ....  
00224      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]  
00225      */ 
00226  
00227     /* Working pointer of inputA */ 
00228     px = pIn1; 
00229  
00230     /* Working pointer of inputB */ 
00231     pSrc2 = pIn2 + (srcBLen - 1u); 
00232     py = pSrc2; 
00233  
00234     /* count is index by which the pointer pIn1 to be incremented */ 
00235     count = 1u; 
00236  
00237     /* -------------------  
00238      * Stage2 process  
00239      * ------------------*/ 
00240  
00241     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.  
00242      * So, to loop unroll over blockSize2,  
00243      * srcBLen should be greater than or equal to 4 */ 
00244     if(srcBLen >= 4u) 
00245     { 
00246       /* Loop unroll over blockSize2 */ 
00247       blkCnt = ((uint32_t) blockSize2 >> 2u); 
00248  
00249       while(blkCnt > 0u) 
00250       { 
00251         /* Set all accumulators to zero */ 
00252         acc0 = 0; 
00253         acc1 = 0; 
00254         acc2 = 0; 
00255         acc3 = 0; 
00256  
00257         /* read x[0], x[1], x[2] samples */ 
00258         x0 = *(px++); 
00259         x1 = *(px++); 
00260         x2 = *(px++); 
00261  
00262         /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00263         k = srcBLen >> 2u; 
00264  
00265         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00266          ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00267         do 
00268         { 
00269           /* Read y[srcBLen - 1] sample */ 
00270           c0 = *(py--); 
00271  
00272           /* Read x[3] sample */ 
00273           x3 = *(px++); 
00274  
00275           /* Perform the multiply-accumulates */ 
00276           /* acc0 +=  x[0] * y[srcBLen - 1] */ 
00277           acc0 += (q63_t) x0 *c0; 
00278           /* acc1 +=  x[1] * y[srcBLen - 1] */ 
00279           acc1 += (q63_t) x1 *c0; 
00280           /* acc2 +=  x[2] * y[srcBLen - 1] */ 
00281           acc2 += (q63_t) x2 *c0; 
00282           /* acc3 +=  x[3] * y[srcBLen - 1] */ 
00283           acc3 += (q63_t) x3 *c0; 
00284  
00285           /* Read y[srcBLen - 2] sample */ 
00286           c0 = *(py--); 
00287  
00288           /* Read x[4] sample */ 
00289           x0 = *(px++); 
00290  
00291           /* Perform the multiply-accumulate */ 
00292           /* acc0 +=  x[1] * y[srcBLen - 2] */ 
00293           acc0 += (q63_t) x1 *c0; 
00294           /* acc1 +=  x[2] * y[srcBLen - 2] */ 
00295           acc1 += (q63_t) x2 *c0; 
00296           /* acc2 +=  x[3] * y[srcBLen - 2] */ 
00297           acc2 += (q63_t) x3 *c0; 
00298           /* acc3 +=  x[4] * y[srcBLen - 2] */ 
00299           acc3 += (q63_t) x0 *c0; 
00300  
00301           /* Read y[srcBLen - 3] sample */ 
00302           c0 = *(py--); 
00303  
00304           /* Read x[5] sample */ 
00305           x1 = *(px++); 
00306  
00307           /* Perform the multiply-accumulates */ 
00308           /* acc0 +=  x[2] * y[srcBLen - 3] */ 
00309           acc0 += (q63_t) x2 *c0; 
00310           /* acc1 +=  x[3] * y[srcBLen - 2] */ 
00311           acc1 += (q63_t) x3 *c0; 
00312           /* acc2 +=  x[4] * y[srcBLen - 2] */ 
00313           acc2 += (q63_t) x0 *c0; 
00314           /* acc3 +=  x[5] * y[srcBLen - 2] */ 
00315           acc3 += (q63_t) x1 *c0; 
00316  
00317           /* Read y[srcBLen - 4] sample */ 
00318           c0 = *(py--); 
00319  
00320           /* Read x[6] sample */ 
00321           x2 = *(px++); 
00322  
00323           /* Perform the multiply-accumulates */ 
00324           /* acc0 +=  x[3] * y[srcBLen - 4] */ 
00325           acc0 += (q63_t) x3 *c0; 
00326           /* acc1 +=  x[4] * y[srcBLen - 4] */ 
00327           acc1 += (q63_t) x0 *c0; 
00328           /* acc2 +=  x[5] * y[srcBLen - 4] */ 
00329           acc2 += (q63_t) x1 *c0; 
00330           /* acc3 +=  x[6] * y[srcBLen - 4] */ 
00331           acc3 += (q63_t) x2 *c0; 
00332  
00333         } while(--k); 
00334  
00335         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00336          ** No loop unrolling is used. */ 
00337         k = srcBLen % 0x4u; 
00338  
00339         while(k > 0u) 
00340         { 
00341           /* Read y[srcBLen - 5] sample */ 
00342           c0 = *(py--); 
00343  
00344           /* Read x[7] sample */ 
00345           x3 = *(px++); 
00346  
00347           /* Perform the multiply-accumulates */ 
00348           /* acc0 +=  x[4] * y[srcBLen - 5] */ 
00349           acc0 += (q63_t) x0 *c0; 
00350           /* acc1 +=  x[5] * y[srcBLen - 5] */ 
00351           acc1 += (q63_t) x1 *c0; 
00352           /* acc2 +=  x[6] * y[srcBLen - 5] */ 
00353           acc2 += (q63_t) x2 *c0; 
00354           /* acc3 +=  x[7] * y[srcBLen - 5] */ 
00355           acc3 += (q63_t) x3 *c0; 
00356  
00357           /* Reuse the present samples for the next MAC */ 
00358           x0 = x1; 
00359           x1 = x2; 
00360           x2 = x3; 
00361  
00362           /* Decrement the loop counter */ 
00363           k--; 
00364         } 
00365  
00366         /* Store the result in the accumulator in the destination buffer. */ 
00367         *pOut++ = (q31_t) (acc0 >> 31); 
00368         *pOut++ = (q31_t) (acc1 >> 31); 
00369         *pOut++ = (q31_t) (acc2 >> 31); 
00370         *pOut++ = (q31_t) (acc3 >> 31); 
00371  
00372         /* Update the inputA and inputB pointers for next MAC calculation */ 
00373         px = pIn1 + (count * 4u); 
00374         py = pSrc2; 
00375  
00376         /* Increment the pointer pIn1 index, count by 1 */ 
00377         count++; 
00378  
00379         /* Decrement the loop counter */ 
00380         blkCnt--; 
00381       } 
00382  
00383       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.  
00384        ** No loop unrolling is used. */ 
00385       blkCnt = (uint32_t) blockSize2 % 0x4u; 
00386  
00387       while(blkCnt > 0u) 
00388       { 
00389         /* Accumulator is made zero for every iteration */ 
00390         sum = 0; 
00391  
00392         /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00393         k = srcBLen >> 2u; 
00394  
00395         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00396          ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00397         while(k > 0u) 
00398         { 
00399           /* Perform the multiply-accumulates */ 
00400           sum += (q63_t) * px++ * (*py--); 
00401           sum += (q63_t) * px++ * (*py--); 
00402           sum += (q63_t) * px++ * (*py--); 
00403           sum += (q63_t) * px++ * (*py--); 
00404  
00405           /* Decrement the loop counter */ 
00406           k--; 
00407         } 
00408  
00409         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00410          ** No loop unrolling is used. */ 
00411         k = srcBLen % 0x4u; 
00412  
00413         while(k > 0u) 
00414         { 
00415           /* Perform the multiply-accumulate */ 
00416           sum += (q63_t) * px++ * (*py--); 
00417  
00418           /* Decrement the loop counter */ 
00419           k--; 
00420         } 
00421  
00422         /* Store the result in the accumulator in the destination buffer. */ 
00423         *pOut++ = (q31_t) (sum >> 31); 
00424  
00425         /* Update the inputA and inputB pointers for next MAC calculation */ 
00426         px = pIn1 + count; 
00427         py = pSrc2; 
00428  
00429         /* Increment the MAC count */ 
00430         count++; 
00431  
00432         /* Decrement the loop counter */ 
00433         blkCnt--; 
00434       } 
00435     } 
00436     else 
00437     { 
00438       /* If the srcBLen is not a multiple of 4,  
00439        * the blockSize2 loop cannot be unrolled by 4 */ 
00440       blkCnt = (uint32_t) blockSize2; 
00441  
00442       while(blkCnt > 0u) 
00443       { 
00444         /* Accumulator is made zero for every iteration */ 
00445         sum = 0; 
00446  
00447         /* srcBLen number of MACS should be performed */ 
00448         k = srcBLen; 
00449  
00450         while(k > 0u) 
00451         { 
00452           /* Perform the multiply-accumulate */ 
00453           sum += (q63_t) * px++ * (*py--); 
00454  
00455           /* Decrement the loop counter */ 
00456           k--; 
00457         } 
00458  
00459         /* Store the result in the accumulator in the destination buffer. */ 
00460         *pOut++ = (q31_t) (sum >> 31); 
00461  
00462         /* Update the inputA and inputB pointers for next MAC calculation */ 
00463         px = pIn1 + count; 
00464         py = pSrc2; 
00465  
00466         /* Increment the MAC count */ 
00467         count++; 
00468  
00469         /* Decrement the loop counter */ 
00470         blkCnt--; 
00471       } 
00472     } 
00473  
00474  
00475     /* --------------------------  
00476      * Initializations of stage3  
00477      * -------------------------*/ 
00478  
00479     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]  
00480      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]  
00481      * ....  
00482      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]  
00483      * sum +=  x[srcALen-1] * y[srcBLen-1]  
00484      */ 
00485  
00486     /* In this stage the MAC operations are decreased by 1 for every iteration.  
00487        The blockSize3 variable holds the number of MAC operations performed */ 
00488     count = srcBLen - 1u; 
00489  
00490     /* Working pointer of inputA */ 
00491     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 
00492     px = pSrc1; 
00493  
00494     /* Working pointer of inputB */ 
00495     pSrc2 = pIn2 + (srcBLen - 1u); 
00496     py = pSrc2; 
00497  
00498     /* -------------------  
00499      * Stage3 process  
00500      * ------------------*/ 
00501  
00502     while(blockSize3 > 0) 
00503     { 
00504       /* Accumulator is made zero for every iteration */ 
00505       sum = 0; 
00506  
00507       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00508       k = count >> 2u; 
00509  
00510       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00511        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00512       while(k > 0u) 
00513       { 
00514         sum += (q63_t) * px++ * (*py--); 
00515         sum += (q63_t) * px++ * (*py--); 
00516         sum += (q63_t) * px++ * (*py--); 
00517         sum += (q63_t) * px++ * (*py--); 
00518  
00519         /* Decrement the loop counter */ 
00520         k--; 
00521       } 
00522  
00523       /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.  
00524        ** No loop unrolling is used. */ 
00525       k = count % 0x4u; 
00526  
00527       while(k > 0u) 
00528       { 
00529         /* Perform the multiply-accumulate */ 
00530         sum += (q63_t) * px++ * (*py--); 
00531  
00532         /* Decrement the loop counter */ 
00533         k--; 
00534       } 
00535  
00536       /* Store the result in the accumulator in the destination buffer. */ 
00537       *pOut++ = (q31_t) (sum >> 31); 
00538  
00539       /* Update the inputA and inputB pointers for next MAC calculation */ 
00540       px = ++pSrc1; 
00541       py = pSrc2; 
00542  
00543       /* Decrement the MAC count */ 
00544       count--; 
00545  
00546       /* Decrement the loop counter */ 
00547       blockSize3--; 
00548  
00549     } 
00550  
00551     /* set status as ARM_MATH_SUCCESS */ 
00552     status = ARM_MATH_SUCCESS; 
00553   } 
00554  
00555   /* Return to application */ 
00556   return (status); 
00557  
00558 } 
00559  
00560 /**  
00561  * @} end of PartialConv group  
00562  */