CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_fast_q31.c Source File

arm_conv_partial_fast_q31.c

00001 /* ----------------------------------------------------------------------  
00002 * Copyright (C) 2010 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        29. November 2010  
00005 * $Revision:    V1.0.3  
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_conv_partial_fast_q31.c  
00009 *  
00010 * Description:  Fast Q31 Partial convolution.  
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Version 1.0.3 2010/11/29 
00015 *    Re-organized the CMSIS folders and updated documentation.  
00016 *   
00017 * Version 1.0.2 2010/11/11  
00018 *    Documentation updated.   
00019 *  
00020 * Version 1.0.1 2010/10/05   
00021 *    Production release and review comments incorporated.  
00022 *  
00023 * Version 1.0.0 2010/09/20   
00024 *    Production release and review comments incorporated.  
00025 * -------------------------------------------------------------------- */ 
00026  
00027 #include "arm_math.h" 
00028  
00029 /**  
00030  * @ingroup groupFilters  
00031  */ 
00032  
00033 /**  
00034  * @addtogroup PartialConv  
00035  * @{  
00036  */ 
00037  
00038 /**  
00039  * @brief Partial convolution of Q31 sequences (fast version).  
00040  * @param[in]       *pSrcA points to the first input sequence.  
00041  * @param[in]       srcALen length of the first input sequence.  
00042  * @param[in]       *pSrcB points to the second input sequence.  
00043  * @param[in]       srcBLen length of the second input sequence.  
00044  * @param[out]      *pDst points to the location where the output result is written.  
00045  * @param[in]       firstIndex is the first output sample to start with.  
00046  * @param[in]       numPoints is the number of output points to be computed.  
00047  * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].  
00048  *  
00049  * \par  
00050  * See <code>arm_conv_partial_q31()</code> for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.  
00051  */ 
00052  
00053 arm_status arm_conv_partial_fast_q31( 
00054   q31_t * pSrcA, 
00055   uint32_t srcALen, 
00056   q31_t * pSrcB, 
00057   uint32_t srcBLen, 
00058   q31_t * pDst, 
00059   uint32_t firstIndex, 
00060   uint32_t numPoints) 
00061 { 
00062   q31_t *pIn1;                                   /* inputA pointer               */ 
00063   q31_t *pIn2;                                   /* inputB pointer               */ 
00064   q31_t *pOut = pDst;                            /* output pointer               */ 
00065   q31_t *px;                                     /* Intermediate inputA pointer  */ 
00066   q31_t *py;                                     /* Intermediate inputB pointer  */ 
00067   q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */ 
00068   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */ 
00069   q31_t x0, x1, x2, x3, c0; 
00070   uint32_t j, k, count, check, blkCnt; 
00071   int32_t blockSize1, blockSize2, blockSize3;    /* loop counters                 */ 
00072   arm_status status;                             /* status of Partial convolution */ 
00073  
00074  
00075   /* Check for range of output samples to be calculated */ 
00076   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 
00077   { 
00078     /* Set status as ARM_MATH_ARGUMENT_ERROR */ 
00079     status = ARM_MATH_ARGUMENT_ERROR; 
00080   } 
00081   else 
00082   { 
00083  
00084     /* The algorithm implementation is based on the lengths of the inputs. */ 
00085     /* srcB is always made to slide across srcA. */ 
00086     /* So srcBLen is always considered as shorter or equal to srcALen */ 
00087     if(srcALen >= srcBLen) 
00088     { 
00089       /* Initialization of inputA pointer */ 
00090       pIn1 = pSrcA; 
00091  
00092       /* Initialization of inputB pointer */ 
00093       pIn2 = pSrcB; 
00094     } 
00095     else 
00096     { 
00097       /* Initialization of inputA pointer */ 
00098       pIn1 = pSrcB; 
00099  
00100       /* Initialization of inputB pointer */ 
00101       pIn2 = pSrcA; 
00102  
00103       /* srcBLen is always considered as shorter or equal to srcALen */ 
00104       j = srcBLen; 
00105       srcBLen = srcALen; 
00106       srcALen = j; 
00107     } 
00108  
00109     /* Conditions to check which loopCounter holds  
00110      * the first and last indices of the output samples to be calculated. */ 
00111     check = firstIndex + numPoints; 
00112     blockSize3 = ((int32_t) check - (int32_t) srcALen); 
00113     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 
00114     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 
00115     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 
00116                                     (int32_t) numPoints) : 0; 
00117     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +  
00118                                     (int32_t) firstIndex); 
00119     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 
00120  
00121     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 
00122     /* The function is internally  
00123      * divided into three stages according to the number of multiplications that has to be  
00124      * taken place between inputA samples and inputB samples. In the first stage of the  
00125      * algorithm, the multiplications increase by one for every iteration.  
00126      * In the second stage of the algorithm, srcBLen number of multiplications are done.  
00127      * In the third stage of the algorithm, the multiplications decrease by one  
00128      * for every iteration. */ 
00129  
00130     /* Set the output pointer to point to the firstIndex  
00131      * of the output sample to be calculated. */ 
00132     pOut = pDst + firstIndex; 
00133  
00134     /* --------------------------  
00135      * Initializations of stage1  
00136      * -------------------------*/ 
00137  
00138     /* sum = x[0] * y[0]  
00139      * sum = x[0] * y[1] + x[1] * y[0]  
00140      * ....  
00141      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]  
00142      */ 
00143  
00144     /* In this stage the MAC operations are increased by 1 for every iteration.  
00145        The count variable holds the number of MAC operations performed.  
00146        Since the partial convolution starts from firstIndex  
00147        Number of Macs to be performed is firstIndex + 1 */ 
00148     count = 1u + firstIndex; 
00149  
00150     /* Working pointer of inputA */ 
00151     px = pIn1; 
00152  
00153     /* Working pointer of inputB */ 
00154     pSrc2 = pIn2 + firstIndex; 
00155     py = pSrc2; 
00156  
00157     /* ------------------------  
00158      * Stage1 process  
00159      * ----------------------*/ 
00160  
00161     /* The first loop starts here */ 
00162     while(blockSize1 > 0) 
00163     { 
00164       /* Accumulator is made zero for every iteration */ 
00165       sum = 0; 
00166  
00167       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00168       k = count >> 2u; 
00169  
00170       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00171        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00172       while(k > 0u) 
00173       { 
00174         /* x[0] * y[srcBLen - 1] */ 
00175         sum = (q31_t) ((((q63_t) sum << 32) +  
00176                         ((q63_t) * px++ * (*py--))) >> 32); 
00177  
00178         /* x[1] * y[srcBLen - 2] */ 
00179         sum = (q31_t) ((((q63_t) sum << 32) +  
00180                         ((q63_t) * px++ * (*py--))) >> 32); 
00181  
00182         /* x[2] * y[srcBLen - 3] */ 
00183         sum = (q31_t) ((((q63_t) sum << 32) +  
00184                         ((q63_t) * px++ * (*py--))) >> 32); 
00185  
00186         /* x[3] * y[srcBLen - 4] */ 
00187         sum = (q31_t) ((((q63_t) sum << 32) +  
00188                         ((q63_t) * px++ * (*py--))) >> 32); 
00189  
00190         /* Decrement the loop counter */ 
00191         k--; 
00192       } 
00193  
00194       /* If the count is not a multiple of 4, compute any remaining MACs here.  
00195        ** No loop unrolling is used. */ 
00196       k = count % 0x4u; 
00197  
00198       while(k > 0u) 
00199       { 
00200         /* Perform the multiply-accumulates */ 
00201         sum = (q31_t) ((((q63_t) sum << 32) +  
00202                         ((q63_t) * px++ * (*py--))) >> 32); 
00203  
00204         /* Decrement the loop counter */ 
00205         k--; 
00206       } 
00207  
00208       /* Store the result in the accumulator in the destination buffer. */ 
00209       *pOut++ = sum << 1; 
00210  
00211       /* Update the inputA and inputB pointers for next MAC calculation */ 
00212       py = ++pSrc2; 
00213       px = pIn1; 
00214  
00215       /* Increment the MAC count */ 
00216       count++; 
00217  
00218       /* Decrement the loop counter */ 
00219       blockSize1--; 
00220     } 
00221  
00222     /* --------------------------  
00223      * Initializations of stage2  
00224      * ------------------------*/ 
00225  
00226     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]  
00227      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]  
00228      * ....  
00229      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]  
00230      */ 
00231  
00232     /* Working pointer of inputA */ 
00233     px = pIn1; 
00234  
00235     /* Working pointer of inputB */ 
00236     pSrc2 = pIn2 + (srcBLen - 1u); 
00237     py = pSrc2; 
00238  
00239     /* count is index by which the pointer pIn1 to be incremented */ 
00240     count = 1u; 
00241  
00242     /* -------------------  
00243      * Stage2 process  
00244      * ------------------*/ 
00245  
00246     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.  
00247      * So, to loop unroll over blockSize2,  
00248      * srcBLen should be greater than or equal to 4 */ 
00249     if(srcBLen >= 4u) 
00250     { 
00251       /* Loop unroll over blockSize2 */ 
00252       blkCnt = ((uint32_t) blockSize2 >> 2u); 
00253  
00254       while(blkCnt > 0u) 
00255       { 
00256         /* Set all accumulators to zero */ 
00257         acc0 = 0; 
00258         acc1 = 0; 
00259         acc2 = 0; 
00260         acc3 = 0; 
00261  
00262         /* read x[0], x[1], x[2] samples */ 
00263         x0 = *(px++); 
00264         x1 = *(px++); 
00265         x2 = *(px++); 
00266  
00267         /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00268         k = srcBLen >> 2u; 
00269  
00270         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00271          ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00272         do 
00273         { 
00274           /* Read y[srcBLen - 1] sample */ 
00275           c0 = *(py--); 
00276  
00277           /* Read x[3] sample */ 
00278           x3 = *(px++); 
00279  
00280           /* Perform the multiply-accumulate */ 
00281           /* acc0 +=  x[0] * y[srcBLen - 1] */ 
00282           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 
00283  
00284           /* acc1 +=  x[1] * y[srcBLen - 1] */ 
00285           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 
00286  
00287           /* acc2 +=  x[2] * y[srcBLen - 1] */ 
00288           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 
00289  
00290           /* acc3 +=  x[3] * y[srcBLen - 1] */ 
00291           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 
00292  
00293           /* Read y[srcBLen - 2] sample */ 
00294           c0 = *(py--); 
00295  
00296           /* Read x[4] sample */ 
00297           x0 = *(px++); 
00298  
00299           /* Perform the multiply-accumulate */ 
00300           /* acc0 +=  x[1] * y[srcBLen - 2] */ 
00301           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32); 
00302           /* acc1 +=  x[2] * y[srcBLen - 2] */ 
00303           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32); 
00304           /* acc2 +=  x[3] * y[srcBLen - 2] */ 
00305           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32); 
00306           /* acc3 +=  x[4] * y[srcBLen - 2] */ 
00307           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32); 
00308  
00309           /* Read y[srcBLen - 3] sample */ 
00310           c0 = *(py--); 
00311  
00312           /* Read x[5] sample */ 
00313           x1 = *(px++); 
00314  
00315           /* Perform the multiply-accumulates */ 
00316           /* acc0 +=  x[2] * y[srcBLen - 3] */ 
00317           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32); 
00318           /* acc1 +=  x[3] * y[srcBLen - 2] */ 
00319           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32); 
00320           /* acc2 +=  x[4] * y[srcBLen - 2] */ 
00321           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32); 
00322           /* acc3 +=  x[5] * y[srcBLen - 2] */ 
00323           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32); 
00324  
00325           /* Read y[srcBLen - 4] sample */ 
00326           c0 = *(py--); 
00327  
00328           /* Read x[6] sample */ 
00329           x2 = *(px++); 
00330  
00331           /* Perform the multiply-accumulates */ 
00332           /* acc0 +=  x[3] * y[srcBLen - 4] */ 
00333           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32); 
00334           /* acc1 +=  x[4] * y[srcBLen - 4] */ 
00335           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32); 
00336           /* acc2 +=  x[5] * y[srcBLen - 4] */ 
00337           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32); 
00338           /* acc3 +=  x[6] * y[srcBLen - 4] */ 
00339           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32); 
00340  
00341  
00342         } while(--k); 
00343  
00344         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00345          ** No loop unrolling is used. */ 
00346         k = srcBLen % 0x4u; 
00347  
00348         while(k > 0u) 
00349         { 
00350           /* Read y[srcBLen - 5] sample */ 
00351           c0 = *(py--); 
00352  
00353           /* Read x[7] sample */ 
00354           x3 = *(px++); 
00355  
00356           /* Perform the multiply-accumulates */ 
00357           /* acc0 +=  x[4] * y[srcBLen - 5] */ 
00358           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 
00359           /* acc1 +=  x[5] * y[srcBLen - 5] */ 
00360           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 
00361           /* acc2 +=  x[6] * y[srcBLen - 5] */ 
00362           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 
00363           /* acc3 +=  x[7] * y[srcBLen - 5] */ 
00364           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 
00365  
00366           /* Reuse the present samples for the next MAC */ 
00367           x0 = x1; 
00368           x1 = x2; 
00369           x2 = x3; 
00370  
00371           /* Decrement the loop counter */ 
00372           k--; 
00373         } 
00374  
00375         /* Store the result in the accumulator in the destination buffer. */ 
00376         *pOut++ = (q31_t) (acc0 << 1); 
00377         *pOut++ = (q31_t) (acc1 << 1); 
00378         *pOut++ = (q31_t) (acc2 << 1); 
00379         *pOut++ = (q31_t) (acc3 << 1); 
00380  
00381         /* Update the inputA and inputB pointers for next MAC calculation */ 
00382         px = pIn1 + (count * 4u); 
00383         py = pSrc2; 
00384  
00385         /* Increment the pointer pIn1 index, count by 1 */ 
00386         count++; 
00387  
00388         /* Decrement the loop counter */ 
00389         blkCnt--; 
00390       } 
00391  
00392       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.  
00393        ** No loop unrolling is used. */ 
00394       blkCnt = (uint32_t) blockSize2 % 0x4u; 
00395  
00396       while(blkCnt > 0u) 
00397       { 
00398         /* Accumulator is made zero for every iteration */ 
00399         sum = 0; 
00400  
00401         /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00402         k = srcBLen >> 2u; 
00403  
00404         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00405          ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00406         while(k > 0u) 
00407         { 
00408           /* Perform the multiply-accumulates */ 
00409           sum = (q31_t) ((((q63_t) sum << 32) + 
00410                           ((q63_t) * px++ * (*py--))) >> 32); 
00411           sum = (q31_t) ((((q63_t) sum << 32) + 
00412                           ((q63_t) * px++ * (*py--))) >> 32); 
00413           sum = (q31_t) ((((q63_t) sum << 32) + 
00414                           ((q63_t) * px++ * (*py--))) >> 32); 
00415           sum = (q31_t) ((((q63_t) sum << 32) + 
00416                           ((q63_t) * px++ * (*py--))) >> 32); 
00417  
00418           /* Decrement the loop counter */ 
00419           k--; 
00420         } 
00421  
00422         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.  
00423          ** No loop unrolling is used. */ 
00424         k = srcBLen % 0x4u; 
00425  
00426         while(k > 0u) 
00427         { 
00428           /* Perform the multiply-accumulate */ 
00429           sum = (q31_t) ((((q63_t) sum << 32) + 
00430                           ((q63_t) * px++ * (*py--))) >> 32); 
00431  
00432           /* Decrement the loop counter */ 
00433           k--; 
00434         } 
00435  
00436         /* Store the result in the accumulator in the destination buffer. */ 
00437         *pOut++ = sum << 1; 
00438  
00439         /* Update the inputA and inputB pointers for next MAC calculation */ 
00440         px = pIn1 + count; 
00441         py = pSrc2; 
00442  
00443         /* Increment the MAC count */ 
00444         count++; 
00445  
00446         /* Decrement the loop counter */ 
00447         blkCnt--; 
00448       } 
00449     } 
00450     else 
00451     { 
00452       /* If the srcBLen is not a multiple of 4,  
00453        * the blockSize2 loop cannot be unrolled by 4 */ 
00454       blkCnt = (uint32_t) blockSize2; 
00455  
00456       while(blkCnt > 0u) 
00457       { 
00458         /* Accumulator is made zero for every iteration */ 
00459         sum = 0; 
00460  
00461         /* srcBLen number of MACS should be performed */ 
00462         k = srcBLen; 
00463  
00464         while(k > 0u) 
00465         { 
00466           /* Perform the multiply-accumulate */ 
00467           sum = (q31_t) ((((q63_t) sum << 32) + 
00468                           ((q63_t) * px++ * (*py--))) >> 32); 
00469  
00470           /* Decrement the loop counter */ 
00471           k--; 
00472         } 
00473  
00474         /* Store the result in the accumulator in the destination buffer. */ 
00475         *pOut++ = sum << 1; 
00476  
00477         /* Update the inputA and inputB pointers for next MAC calculation */ 
00478         px = pIn1 + count; 
00479         py = pSrc2; 
00480  
00481         /* Increment the MAC count */ 
00482         count++; 
00483  
00484         /* Decrement the loop counter */ 
00485         blkCnt--; 
00486       } 
00487     } 
00488  
00489  
00490     /* --------------------------  
00491      * Initializations of stage3  
00492      * -------------------------*/ 
00493  
00494     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]  
00495      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]  
00496      * ....  
00497      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]  
00498      * sum +=  x[srcALen-1] * y[srcBLen-1]  
00499      */ 
00500  
00501     /* In this stage the MAC operations are decreased by 1 for every iteration.  
00502        The count variable holds the number of MAC operations performed */ 
00503     count = srcBLen - 1u; 
00504  
00505     /* Working pointer of inputA */ 
00506     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 
00507     px = pSrc1; 
00508  
00509     /* Working pointer of inputB */ 
00510     pSrc2 = pIn2 + (srcBLen - 1u); 
00511     py = pSrc2; 
00512  
00513     /* -------------------  
00514      * Stage3 process  
00515      * ------------------*/ 
00516  
00517     while(blockSize3 > 0) 
00518     { 
00519       /* Accumulator is made zero for every iteration */ 
00520       sum = 0; 
00521  
00522       /* Apply loop unrolling and compute 4 MACs simultaneously. */ 
00523       k = count >> 2u; 
00524  
00525       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.  
00526        ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 
00527       while(k > 0u) 
00528       { 
00529         /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 
00530         sum = (q31_t) ((((q63_t) sum << 32) +  
00531                         ((q63_t) * px++ * (*py--))) >> 32); 
00532  
00533         /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 
00534         sum = (q31_t) ((((q63_t) sum << 32) +  
00535                         ((q63_t) * px++ * (*py--))) >> 32); 
00536  
00537         /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 
00538         sum = (q31_t) ((((q63_t) sum << 32) +  
00539                         ((q63_t) * px++ * (*py--))) >> 32); 
00540  
00541         /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 
00542         sum = (q31_t) ((((q63_t) sum << 32) +  
00543                         ((q63_t) * px++ * (*py--))) >> 32); 
00544  
00545         /* Decrement the loop counter */ 
00546         k--; 
00547       } 
00548  
00549       /* If the count is not a multiple of 4, compute any remaining MACs here.  
00550        ** No loop unrolling is used. */ 
00551       k = count % 0x4u; 
00552  
00553       while(k > 0u) 
00554       { 
00555         /* Perform the multiply-accumulates */ 
00556         /* sum +=  x[srcALen-1] * y[srcBLen-1] */ 
00557         sum = (q31_t) ((((q63_t) sum << 32) +  
00558                         ((q63_t) * px++ * (*py--))) >> 32); 
00559  
00560         /* Decrement the loop counter */ 
00561         k--; 
00562       } 
00563  
00564       /* Store the result in the accumulator in the destination buffer. */ 
00565       *pOut++ = sum << 1; 
00566  
00567       /* Update the inputA and inputB pointers for next MAC calculation */ 
00568       px = ++pSrc1; 
00569       py = pSrc2; 
00570  
00571       /* Decrement the MAC count */ 
00572       count--; 
00573  
00574       /* Decrement the loop counter */ 
00575       blockSize3--; 
00576  
00577     } 
00578  
00579     /* set status as ARM_MATH_SUCCESS */ 
00580     status = ARM_MATH_SUCCESS; 
00581   } 
00582  
00583   /* Return to application */ 
00584   return (status); 
00585  
00586 } 
00587  
00588 /**  
00589  * @} end of PartialConv group  
00590  */